diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 2 | ||||
-rw-r--r-- | net/core/dev.c | 206 | ||||
-rw-r--r-- | net/core/devlink.c | 594 | ||||
-rw-r--r-- | net/core/dst.c | 14 | ||||
-rw-r--r-- | net/core/ethtool.c | 14 | ||||
-rw-r--r-- | net/core/filter.c | 63 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 69 | ||||
-rw-r--r-- | net/core/gen_stats.c | 9 | ||||
-rw-r--r-- | net/core/neighbour.c | 1 | ||||
-rw-r--r-- | net/core/net-procfs.c | 4 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 6 | ||||
-rw-r--r-- | net/core/net_namespace.c | 8 | ||||
-rw-r--r-- | net/core/pktgen.c | 15 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 322 | ||||
-rw-r--r-- | net/core/skbuff.c | 14 | ||||
-rw-r--r-- | net/core/sock.c | 56 | ||||
-rw-r--r-- | net/core/sock_reuseport.c | 4 | ||||
-rw-r--r-- | net/core/xdp.c | 73 |
18 files changed, 1213 insertions, 261 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 1fd0a9c88b1b..6dbbba8c57ae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o + fib_notifier.o xdp.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o diff --git a/net/core/dev.c b/net/core/dev.c index 0e0ba36eeac9..94435cd09072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1554,6 +1554,23 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +/** + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device + * @dev: device + * + * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. + */ +static void dev_disable_gro_hw(struct net_device *dev) +{ + dev->wanted_features &= ~NETIF_F_GRO_HW; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); +} + static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { @@ -2815,7 +2832,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path))) + if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; @@ -3054,7 +3071,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, } EXPORT_SYMBOL(skb_csum_hwoffload_help); -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; @@ -3078,9 +3095,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device __skb_linearize(skb)) goto out_kfree_skb; - if (validate_xmit_xfrm(skb, features)) - goto out_kfree_skb; - /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. @@ -3097,6 +3111,8 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device } } + skb = validate_xmit_xfrm(skb, features, again); + return skb; out_kfree_skb: @@ -3106,7 +3122,7 @@ out_null: return NULL; } -struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; @@ -3117,7 +3133,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d /* in case skb wont be segmented, point to itself */ skb->prev = skb; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; @@ -3174,6 +3190,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3204,9 +3235,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); - } else - qdisc_run_end(q); + } + qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; @@ -3216,6 +3247,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); @@ -3388,8 +3420,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, else queue_index = __netdev_pick_tx(dev, skb); - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); @@ -3428,6 +3459,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; + bool again = false; skb_reset_mac_header(skb); @@ -3489,7 +3521,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) XMIT_RECURSION_LIMIT)) goto recursion_alert; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; @@ -3885,9 +3917,33 @@ drop: return NET_RX_DROP; } +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; struct xdp_buff xdp; void *orig_data; @@ -3931,6 +3987,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; + rxqueue = netif_get_rxqueue(skb); + xdp.rxq = &rxqueue->xdp_rxq; + act = bpf_prog_run_xdp(xdp_prog, &xdp); off = xdp.data - orig_data; @@ -4155,21 +4214,26 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); } } + + xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) @@ -4557,6 +4621,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } else if (new && !old) { static_key_slow_inc(&generic_xdp_needed); dev_disable_lro(dev); + dev_disable_gro_hw(dev); } break; @@ -7085,17 +7150,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) +void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + struct netdev_bpf *xdp) { - struct netdev_bpf xdp; - - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; + memset(xdp, 0, sizeof(*xdp)); + xdp->command = XDP_QUERY_PROG; /* Query must always succeed. */ - WARN_ON(bpf_op(dev, &xdp) < 0); - if (prog_id) - *prog_id = xdp.prog_id; + WARN_ON(bpf_op(dev, xdp) < 0); +} + +static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +{ + struct netdev_bpf xdp; + + __dev_xdp_query(dev, bpf_op, &xdp); return xdp.prog_attached; } @@ -7118,6 +7187,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, return bpf_op(dev, &xdp); } +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + /* Remove generic XDP */ + WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + + /* Remove from the driver */ + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return; + + __dev_xdp_query(dev, ndo_bpf, &xdp); + if (xdp.prog_attached == XDP_ATTACHED_NONE) + return; + + /* Program removal should always succeed */ + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -7146,10 +7236,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, bpf_op)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, @@ -7248,6 +7338,7 @@ static void rollback_registered_many(struct list_head *head) /* Shutdown queueing discipline. */ dev_shutdown(dev); + dev_xdp_uninstall(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@ -7391,6 +7482,18 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~dev->gso_partial_features; } + if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. + */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + return features; } @@ -7524,12 +7627,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); @@ -7539,11 +7642,38 @@ static int netif_alloc_rx_queues(struct net_device *dev) dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kvfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); + + kvfree(dev->_rx); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -8104,12 +8234,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif alloc_size = sizeof(struct net_device); if (sizeof_priv) { @@ -8166,12 +8294,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif strcpy(dev->name, name); dev->name_assign_type = name_assign_type; @@ -8207,13 +8333,10 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kvfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); @@ -8226,12 +8349,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; - prog = rcu_dereference_protected(dev->xdp_prog, 1); - if (prog) { - bpf_prog_put(prog); - static_key_slow_dec(&generic_xdp_needed); - } - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); @@ -8819,6 +8936,9 @@ static int __init net_dev_init(void) skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); +#ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +#endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS diff --git a/net/core/devlink.c b/net/core/devlink.c index 7d430c1d9c3e..dd7d6dd07bfb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -/* devlink_port_mutex - * - * Shared lock to guard lists of ports in all devlink devices. - */ -static DEFINE_MUTEX(devlink_port_mutex); - static struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); @@ -335,15 +329,18 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) -#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3) - /* port is not needed but we need to ensure they don't - * change in the middle of command - */ + +/* The per devlink instance lock is taken by default in the pre-doit + * operation, yet several commands do not require this. The global + * devlink lock is taken and protects from disruption by user-calls. + */ +#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink; + int err; mutex_lock(&devlink_mutex); devlink = devlink_get_from_info(info); @@ -351,44 +348,47 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); } + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_lock(&devlink->lock); if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { info->user_ptr[0] = devlink; } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { struct devlink_port *devlink_port; - mutex_lock(&devlink_port_mutex); devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_port); + err = PTR_ERR(devlink_port); + goto unlock; } info->user_ptr[0] = devlink_port; } - if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) { - mutex_lock(&devlink_port_mutex); - } if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { struct devlink_sb *devlink_sb; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_sb); + err = PTR_ERR(devlink_sb); + goto unlock; } info->user_ptr[1] = devlink_sb; } return 0; + +unlock: + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); + return err; } static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT || - ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) - mutex_unlock(&devlink_port_mutex); + struct devlink *devlink; + + devlink = devlink_get_from_info(info); + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -614,10 +614,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; @@ -628,13 +628,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) + if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -801,6 +803,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { idx++; @@ -811,10 +814,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) + if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -935,14 +941,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -1123,22 +1133,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_port_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -1347,23 +1359,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_tc_pool_bind_get) continue; + + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -1679,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, table->counters_enabled)) goto nla_put_failure; + if (table->resource_valid) { + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units, DEVLINK_ATTR_PAD); + } if (devlink_dpipe_matches_put(table, skb)) goto nla_put_failure; @@ -2273,6 +2294,272 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, counters_enable); } +struct devlink_resource * +devlink_resource_find(struct devlink *devlink, + struct devlink_resource *resource, u64 resource_id) +{ + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + struct devlink_resource *child_resource; + + if (resource->id == resource_id) + return resource; + + child_resource = devlink_resource_find(devlink, resource, + resource_id); + if (child_resource) + return child_resource; + } + return NULL; +} + +void devlink_resource_validate_children(struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + bool size_valid = true; + u64 parts_size = 0; + + if (list_empty(&resource->resource_list)) + goto out; + + list_for_each_entry(child_resource, &resource->resource_list, list) + parts_size += child_resource->size_new; + + if (parts_size > resource->size) + size_valid = false; +out: + resource->size_valid = size_valid; +} + +static int devlink_nl_cmd_resource_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + u64 resource_id; + u64 size; + int err; + + if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] || + !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]) + return -EINVAL; + resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) + return -EINVAL; + + if (!resource->resource_ops->size_validate) + return -EINVAL; + + size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); + err = resource->resource_ops->size_validate(devlink, size, + info->extack); + if (err) + return err; + + resource->size_new = size; + devlink_resource_validate_children(resource); + if (resource->parent) + devlink_resource_validate_children(resource->parent); + return 0; +} + +static void +devlink_resource_size_params_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + struct devlink_resource_size_params *size_params; + + size_params = resource->size_params; + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min, DEVLINK_ATTR_PAD); + nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit); +} + +static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, + struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + struct nlattr *child_resource_attr; + struct nlattr *resource_attr; + + resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE); + if (!resource_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, + DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (resource->size != resource->size_new) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD); + if (resource->resource_ops && resource->resource_ops->occ_get) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->resource_ops->occ_get(devlink), + DEVLINK_ATTR_PAD); + devlink_resource_size_params_put(resource, skb); + if (list_empty(&resource->resource_list)) + goto out; + + if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, + resource->size_valid)) + goto nla_put_failure; + + child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!child_resource_attr) + goto nla_put_failure; + + list_for_each_entry(child_resource, &resource->resource_list, list) { + if (devlink_resource_put(devlink, skb, child_resource)) + goto resource_put_failure; + } + + nla_nest_end(skb, child_resource_attr); +out: + nla_nest_end(skb, resource_attr); + return 0; + +resource_put_failure: + nla_nest_cancel(skb, child_resource_attr); +nla_put_failure: + nla_nest_cancel(skb, resource_attr); + return -EMSGSIZE; +} + +static int devlink_resource_fill(struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + struct nlattr *resources_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + resource = list_first_entry(&devlink->resource_list, + struct devlink_resource, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + + resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!resources_attr) + goto nla_put_failure; + + incomplete = false; + i = 0; + list_for_each_entry_from(resource, &devlink->resource_list, list) { + err = devlink_resource_put(devlink, skb, resource); + if (err) { + if (!i) + goto err_resource_put; + incomplete = true; + break; + } + i++; + } + nla_nest_end(skb, resources_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_resource_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->resource_list)) + return -EOPNOTSUPP; + + return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); +} + +static int +devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) +{ + struct list_head *resource_list; + int err = 0; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + if (!resource->size_valid) + return -EINVAL; + err = devlink_resources_validate(devlink, resource, info); + if (err) + return err; + } + return err; +} + +static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + int err; + + if (!devlink->ops->reload) + return -EOPNOTSUPP; + + err = devlink_resources_validate(devlink, NULL, info); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); + return err; + } + return devlink->ops->reload(devlink); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2291,6 +2578,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, }; static const struct genl_ops devlink_nl_ops[] = { @@ -2322,14 +2611,16 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_port_split_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .doit = devlink_nl_cmd_port_unsplit_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_SB_GET, @@ -2397,8 +2688,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, @@ -2406,8 +2696,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_ESWITCH_GET, @@ -2451,6 +2740,28 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_RESOURCE_SET, + .doit = devlink_nl_cmd_resource_set, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RESOURCE_DUMP, + .doit = devlink_nl_cmd_resource_dump, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RELOAD, + .doit = devlink_nl_cmd_reload, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -2488,6 +2799,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); + mutex_init(&devlink->lock); return devlink; } EXPORT_SYMBOL_GPL(devlink_alloc); @@ -2550,16 +2863,16 @@ int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index) { - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); if (devlink_port_index_exists(devlink, port_index)) { - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); return -EEXIST; } devlink_port->devlink = devlink; devlink_port->index = port_index; devlink_port->registered = true; list_add_tail(&devlink_port->list, &devlink->port_list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } @@ -2572,10 +2885,12 @@ EXPORT_SYMBOL_GPL(devlink_port_register); */ void devlink_port_unregister(struct devlink_port *devlink_port) { + struct devlink *devlink = devlink_port->devlink; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); list_del(&devlink_port->list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -2651,7 +2966,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, struct devlink_sb *devlink_sb; int err = 0; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); if (devlink_sb_index_exists(devlink, sb_index)) { err = -EEXIST; goto unlock; @@ -2670,7 +2985,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, devlink_sb->egress_tc_count = egress_tc_count; list_add_tail(&devlink_sb->list, &devlink->sb_list); unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return err; } EXPORT_SYMBOL_GPL(devlink_sb_register); @@ -2679,11 +2994,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) { struct devlink_sb *devlink_sb; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink_sb = devlink_sb_get_by_index(devlink, sb_index); WARN_ON(!devlink_sb); list_del(&devlink_sb->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree(devlink_sb); } EXPORT_SYMBOL_GPL(devlink_sb_unregister); @@ -2699,9 +3014,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); int devlink_dpipe_headers_register(struct devlink *devlink, struct devlink_dpipe_headers *dpipe_headers) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = dpipe_headers; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); @@ -2715,9 +3030,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); */ void devlink_dpipe_headers_unregister(struct devlink *devlink) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = NULL; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister); @@ -2783,9 +3098,9 @@ int devlink_dpipe_table_register(struct devlink *devlink, table->priv = priv; table->counter_control_extern = counter_control_extern; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); @@ -2801,20 +3116,181 @@ void devlink_dpipe_table_unregister(struct devlink *devlink, { struct devlink_dpipe_table *table; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name); if (!table) goto unlock; list_del_rcu(&table->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree_rcu(table, rcu); return; unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); +/** + * devlink_resource_register - devlink resource register + * + * @devlink: devlink + * @resource_name: resource's name + * @top_hierarchy: top hierarchy + * @reload_required: reload is required for new configuration to + * apply + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_reosurce_id: resource's parent id + * @size params: size parameters + * @resource_ops: resource ops + */ +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops) +{ + struct devlink_resource *resource; + struct list_head *resource_list; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (resource) { + err = -EINVAL; + goto out; + } + + resource = kzalloc(sizeof(*resource), GFP_KERNEL); + if (!resource) { + err = -ENOMEM; + goto out; + } + + if (top_hierarchy) { + resource_list = &devlink->resource_list; + } else { + struct devlink_resource *parent_resource; + + parent_resource = devlink_resource_find(devlink, NULL, + parent_resource_id); + if (parent_resource) { + resource_list = &parent_resource->resource_list; + resource->parent = parent_resource; + } else { + err = -EINVAL; + goto out; + } + } + + resource->name = resource_name; + resource->size = resource_size; + resource->size_new = resource_size; + resource->id = resource_id; + resource->resource_ops = resource_ops; + resource->size_valid = true; + resource->size_params = size_params; + INIT_LIST_HEAD(&resource->resource_list); + list_add_tail(&resource->list, resource_list); +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_register); + +/** + * devlink_resources_unregister - free all resources + * + * @devlink: devlink + * @resource: resource + */ +void devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ + struct devlink_resource *tmp, *child_resource; + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + if (!resource) + mutex_lock(&devlink->lock); + + list_for_each_entry_safe(child_resource, tmp, resource_list, list) { + devlink_resources_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } + + if (!resource) + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_resources_unregister); + +/** + * devlink_resource_size_get - get and update size + * + * @devlink: devlink + * @resource_id: the requested resource id + * @p_resource_size: ptr to update + */ +int devlink_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size) +{ + struct devlink_resource *resource; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) { + err = -EINVAL; + goto out; + } + *p_resource_size = resource->size_new; + resource->size = resource->size_new; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_size_get); + +/** + * devlink_dpipe_table_resource_set - set the resource id + * + * @devlink: devlink + * @table_name: table name + * @resource_id: resource id + * @resource_units: number of resource's units consumed per table's entry + */ +int devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + struct devlink_dpipe_table *table; + int err = 0; + + mutex_lock(&devlink->lock); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) { + err = -EINVAL; + goto out; + } + table->resource_id = resource_id; + table->resource_units = resource_units; + table->resource_valid = true; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/dst.c b/net/core/dst.c index 662a2d4a3d19..007aa0b08291 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> +#include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -62,15 +63,12 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { - dst->child = NULL; dst->dev = dev; if (dev) dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; - dst->path = dst; - dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -88,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } @@ -116,12 +113,17 @@ EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { - struct dst_entry *child; + struct dst_entry *child = NULL; smp_rmb(); - child = dst->child; +#ifdef CONFIG_XFRM + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + child = xdst->child; + } +#endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 8225416911ae..107b122c8969 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -73,6 +73,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LLTX_BIT] = "tx-lockless", [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", @@ -1692,14 +1693,23 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam; + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; - if (!dev->ethtool_ops->set_ringparam) + if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; + dev->ethtool_ops->get_ringparam(dev, &max); + + /* ensure new ring parameters are within the maximums */ + if (ringparam.rx_pending > max.rx_max_pending || + ringparam.rx_mini_pending > max.rx_mini_max_pending || + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || + ringparam.tx_pending > max.tx_max_pending) + return -EINVAL; + return dev->ethtool_ops->set_ringparam(dev, &ringparam); } diff --git a/net/core/filter.c b/net/core/filter.c index d339ef170df6..db2ee8c7e1bd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2682,8 +2682,9 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) return 0; } -int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int xdp_do_generic_redirect_map(struct net_device *dev, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; @@ -3011,6 +3012,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -3024,8 +3027,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; @@ -4301,6 +4302,24 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, + queue_index)); + break; } return insn - insn_buf; @@ -4435,6 +4454,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + +/* Helper macro for adding read access to tcp_sock fields. */ +#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + offsetof(struct tcp_sock, FIELD_NAME)); \ + } while (0) + + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP32(snd_cwnd); + break; + + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP32(srtt_us); + break; } return insn - insn_buf; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15ce30063765..02db7b122a73 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -24,6 +24,7 @@ #include <linux/tcp.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> +#include <uapi/linux/batadv_packet.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -133,10 +134,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, ctrl->addr_type = type; } -static void -__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container) +void +skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; @@ -212,6 +213,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->dst = key->tp_dst; } } +EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, @@ -436,6 +438,57 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, return FLOW_DISSECT_RET_PROTO_AGAIN; } +/** + * __skb_flow_dissect_batadv() - dissect batman-adv header + * @skb: sk_buff to with the batman-adv header + * @key_control: flow dissectors control key + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @p_proto: pointer used to update the protocol to process next + * @p_nhoff: pointer used to update inner network header offset + * @hlen: packet header length + * @flags: any combination of FLOW_DISSECTOR_F_* + * + * ETH_P_BATMAN packets are tried to be dissected. Only + * &struct batadv_unicast packets are actually processed because they contain an + * inner ethernet header and are usually followed by actual network header. This + * allows the flow dissector to continue processing the packet. + * + * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, + * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, + * otherwise FLOW_DISSECT_RET_OUT_BAD + */ +static enum flow_dissect_ret +__skb_flow_dissect_batadv(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + void *data, __be16 *p_proto, int *p_nhoff, int hlen, + unsigned int flags) +{ + struct { + struct batadv_unicast_packet batadv_unicast; + struct ethhdr eth; + } *hdr, _hdr; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) + return FLOW_DISSECT_RET_OUT_BAD; + + *p_proto = hdr->eth.h_proto; + *p_nhoff += sizeof(*hdr); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -576,9 +629,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - __skb_flow_dissect_tunnel_info(skb, flow_dissector, - target_container); - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); @@ -817,6 +867,11 @@ proto_again: nhoff, hlen); break; + case htons(ETH_P_BATMAN): + fdret = __skb_flow_dissect_batadv(skb, key_control, data, + &proto, &nhoff, hlen, flags); + break; + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 87f28557b329..b2b2323bdc84 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7f831711b6e0..7b7a14abba28 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2862,7 +2862,6 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) }; static const struct file_operations neigh_stat_seq_fops = { - .owner = THIS_MODULE, .open = neigh_stat_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 615ccab55f38..e010bb800d7b 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -182,7 +182,6 @@ static int dev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, .open = dev_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -202,7 +201,6 @@ static int softnet_seq_open(struct inode *inode, struct file *file) } static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, .open = softnet_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -306,7 +304,6 @@ static int ptype_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, .open = ptype_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -387,7 +384,6 @@ static int dev_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, .open = dev_mc_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 799b75268291..7bf8b85ade16 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -961,7 +961,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1367,7 +1367,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1558,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!atomic_read(&dev_net(ndev)->count)) + if (!refcount_read(&dev_net(ndev)->count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 60a71be75aea..2213d45fcafd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -35,7 +35,7 @@ LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); struct net init_net = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), }; EXPORT_SYMBOL(init_net); @@ -224,10 +224,10 @@ int peernet2id_alloc(struct net *net, struct net *peer) bool alloc; int id; - if (atomic_read(&net->count) == 0) + if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); - alloc = atomic_read(&peer->count) == 0 ? false : true; + alloc = refcount_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) @@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) int error = 0; LIST_HEAD(net_exit_list); - atomic_set(&net->count, 1); + refcount_set(&net->count, 1); refcount_set(&net->passive, 1); net->dev_base_seq = 1; net->user_ns = user_ns; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f95a15086225..4fcfcb14e7c6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -399,7 +399,7 @@ struct pktgen_dev { __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ __u32 spi; - struct dst_entry dst; + struct xfrm_dst xdst; struct dst_ops dstops; #endif char result[512]; @@ -523,7 +523,6 @@ static int pgctrl_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_fops = { - .owner = THIS_MODULE, .open = pgctrl_open, .read = seq_read, .llseek = seq_lseek, @@ -1804,7 +1803,6 @@ static int pktgen_if_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_if_fops = { - .owner = THIS_MODULE, .open = pktgen_if_open, .read = seq_read, .llseek = seq_lseek, @@ -1942,7 +1940,6 @@ static int pktgen_thread_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_thread_fops = { - .owner = THIS_MODULE, .open = pktgen_thread_open, .read = seq_read, .llseek = seq_lseek, @@ -2609,7 +2606,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) * supports both transport/tunnel mode + ESP/AH type. */ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) - skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); @@ -3742,10 +3739,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) * performance under such circumstance. */ pkt_dev->dstops.family = AF_INET; - pkt_dev->dst.dev = pkt_dev->odev; - dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); - pkt_dev->dst.child = &pkt_dev->dst; - pkt_dev->dst.ops = &pkt_dev->dstops; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 778d7f03404a..16d644a4f974 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -62,7 +62,9 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; + struct module *owner; unsigned int flags; + struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); @@ -127,8 +129,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; -static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -144,72 +145,127 @@ static inline int rtm_msgindex(int msgtype) return msgindex; } -/** - * __rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) +static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) +{ + struct rtnl_link **tab; + + if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) + protocol = PF_UNSPEC; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); + if (!tab) + tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); + + return tab[msgtype]; +} + +static int rtnl_register_internal(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) { - struct rtnl_link *tab; + struct rtnl_link *link, *old; + struct rtnl_link __rcu **tab; int msgindex; + int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]); + rtnl_lock(); + tab = rtnl_msg_handlers[protocol]; if (tab == NULL) { - tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); - if (tab == NULL) - return -ENOBUFS; + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); + if (!tab) + goto unlock; + /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } + old = rtnl_dereference(tab[msgindex]); + if (old) { + link = kmemdup(old, sizeof(*old), GFP_KERNEL); + if (!link) + goto unlock; + } else { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + goto unlock; + } + + WARN_ON(link->owner && link->owner != owner); + link->owner = owner; + + WARN_ON(doit && link->doit && link->doit != doit); if (doit) - tab[msgindex].doit = doit; + link->doit = doit; + WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) - tab[msgindex].dumpit = dumpit; - tab[msgindex].flags |= flags; + link->dumpit = dumpit; - return 0; + link->flags |= flags; + + /* publish protocol:msgtype */ + rcu_assign_pointer(tab[msgindex], link); + ret = 0; + if (old) + kfree_rcu(old, rcu); +unlock: + rtnl_unlock(); + return ret; +} + +/** + * rtnl_register_module - Register a rtnetlink message type + * + * @owner: module registering the hook (THIS_MODULE) + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * + * Like rtnl_register, but for use by removable modules. + */ +int rtnl_register_module(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + return rtnl_register_internal(owner, protocol, msgtype, + doit, dumpit, flags); } -EXPORT_SYMBOL_GPL(__rtnl_register); +EXPORT_SYMBOL_GPL(rtnl_register_module); /** * rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. * - * Identical to __rtnl_register() but panics on failure. This is useful - * as failure of this function is very unlikely, it can only happen due - * to lack of memory when allocating the chain to store all message - * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continuing. + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { - if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0) - panic("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", - protocol, msgtype); + int err; + + err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, + flags); + if (err) + pr_err("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", protocol, msgtype); } -EXPORT_SYMBOL_GPL(rtnl_register); /** * rtnl_unregister - Unregister a rtnetlink message type @@ -220,24 +276,25 @@ EXPORT_SYMBOL_GPL(rtnl_register); */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); - if (!handlers) { + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { rtnl_unlock(); return -ENOENT; } - handlers[msgindex].doit = NULL; - handlers[msgindex].dumpit = NULL; - handlers[msgindex].flags = 0; + link = tab[msgindex]; + rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); + kfree_rcu(link, rcu); + return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); @@ -251,20 +308,27 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; + int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rtnl_msg_handlers[protocol]; RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); + for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { + link = tab[msgindex]; + if (!link) + continue; + + rcu_assign_pointer(tab[msgindex], NULL); + kfree_rcu(link, rcu); + } rtnl_unlock(); synchronize_net(); - while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1) - schedule(); - kfree(handlers); + kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); @@ -840,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -1194,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) { + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } @@ -1261,6 +1333,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) { const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; + struct netdev_bpf xdp; ASSERT_RTNL(); @@ -1273,7 +1346,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); + __dev_xdp_query(dev, ops->ndo_bpf, &xdp); + *prog_id = xdp.prog_id; + + return xdp.prog_attached; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) @@ -1569,6 +1645,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, @@ -2219,6 +2297,34 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); + + if (max_size > GSO_MAX_SIZE) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_size ^ max_size) { + netif_set_gso_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_MAX_SEGS]) { + u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + + if (max_segs > GSO_MAX_SEGS) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_segs ^ max_segs) { + dev->gso_max_segs = max_segs; + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -2583,6 +2689,10 @@ struct net_device *rtnl_create_link(struct net *net, dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + if (tb[IFLA_GSO_MAX_SIZE]) + netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); + if (tb[IFLA_GSO_MAX_SEGS]) + dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } @@ -2973,18 +3083,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + struct rtnl_link **tab; int type = cb->nlh->nlmsg_type-RTM_BASE; - struct rtnl_link *handlers; + struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; - handlers = rtnl_dereference(rtnl_msg_handlers[idx]); - if (!handlers) + if (type < 0 || type >= RTM_NR_MSGTYPES) continue; - dumpit = READ_ONCE(handlers[type].dumpit); + tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); + if (!tab) + continue; + + link = tab[type]; + if (!link) + continue; + + dumpit = link->dumpit; if (!dumpit) continue; @@ -4314,7 +4432,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct rtnl_link *handlers; + struct rtnl_link *link; + struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; @@ -4338,79 +4457,85 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (family >= ARRAY_SIZE(rtnl_msg_handlers)) - family = PF_UNSPEC; - rcu_read_lock(); - handlers = rcu_dereference(rtnl_msg_handlers[family]); - if (!handlers) { - family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); - } - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; u16 min_dump_alloc = 0; - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) { + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]); - if (!handlers) - goto err_unlock; - - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) goto err_unlock; } - - refcount_inc(&rtnl_msg_handlers_ref[family]); + owner = link->owner; + dumpit = link->dumpit; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); + err = 0; + /* need to do this before rcu_read_unlock() */ + if (!try_module_get(owner)) + err = -EPROTONOSUPPORT; + rcu_read_unlock(); rtnl = net->rtnl; - { + if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, + .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); + /* netlink_dump_start() will keep a reference on + * module if dump is still in progress. + */ + module_put(owner); } - refcount_dec(&rtnl_msg_handlers_ref[family]); return err; } - doit = READ_ONCE(handlers[type].doit); - if (!doit) { + link = rtnl_get_link(family, type); + if (!link || !link->doit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); + link = rtnl_get_link(PF_UNSPEC, type); + if (!link || !link->doit) + goto out_unlock; + } + + owner = link->owner; + if (!try_module_get(owner)) { + err = -EPROTONOSUPPORT; + goto out_unlock; } - flags = READ_ONCE(handlers[type].flags); + flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { - refcount_inc(&rtnl_msg_handlers_ref[family]); - doit = READ_ONCE(handlers[type].doit); + doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); - refcount_dec(&rtnl_msg_handlers_ref[family]); + module_put(owner); return err; } - rcu_read_unlock(); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[family]); - if (handlers) { - doit = READ_ONCE(handlers[type].doit); - if (doit) - err = doit(skb, nlh, extack); - } + link = rtnl_get_link(family, type); + if (link && link->doit) + err = link->doit(skb, nlh, extack); rtnl_unlock(); + + module_put(owner); + + return err; + +out_unlock: + rcu_read_unlock(); return err; err_unlock: @@ -4498,11 +4623,6 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++) - refcount_set(&rtnl_msg_handlers_ref[i], 1); - if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 08f574081315..01e8285aea73 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3656,6 +3656,10 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; + while (pos < offset + len) { if (i >= nfrags) { BUG_ON(skb_headlen(list_skb)); @@ -3667,6 +3671,11 @@ normal: BUG_ON(!nfrags); + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, + GFP_ATOMIC)) + goto err; + list_skb = list_skb->next; } @@ -3678,11 +3687,6 @@ normal: goto err; } - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) - goto err; - if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) - goto err; - *nskb_frag = *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); diff --git a/net/core/sock.c b/net/core/sock.c index c0b5b2f17412..abf4cbff99b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -145,6 +145,8 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +static void sock_inuse_add(struct net *net, int val); + /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through @@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; - if (likely(sk->sk_net_refcnt)) + if (likely(sk->sk_net_refcnt)) { get_net(net); + sock_inuse_add(net, 1); + } + sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); @@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk) static void __sk_free(struct sock *sk) { + if (likely(sk->sk_net_refcnt)) + sock_inuse_add(sock_net(sk), -1); + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) sock_diag_broadcast_destroy(sk); else @@ -1716,6 +1724,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + if (likely(newsk->sk_net_refcnt)) + sock_inuse_add(sock_net(newsk), 1); /* * Before updating sk_refcnt, we must commit prior changes to memory @@ -3045,7 +3055,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); + __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -3055,21 +3065,50 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) int res = 0; for_each_possible_cpu(cpu) - res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +static void sock_inuse_add(struct net *net, int val) +{ + this_cpu_add(*net->core.sock_inuse, val); +} + +int sock_inuse_get(struct net *net) +{ + int cpu, res = 0; + + for_each_possible_cpu(cpu) + res += *per_cpu_ptr(net->core.sock_inuse, cpu); + + return res; +} + +EXPORT_SYMBOL_GPL(sock_inuse_get); + static int __net_init sock_inuse_init_net(struct net *net) { - net->core.inuse = alloc_percpu(struct prot_inuse); - return net->core.inuse ? 0 : -ENOMEM; + net->core.prot_inuse = alloc_percpu(struct prot_inuse); + if (net->core.prot_inuse == NULL) + return -ENOMEM; + + net->core.sock_inuse = alloc_percpu(int); + if (net->core.sock_inuse == NULL) + goto out; + + return 0; + +out: + free_percpu(net->core.prot_inuse); + return -ENOMEM; } static void __net_exit sock_inuse_exit_net(struct net *net) { - free_percpu(net->core.inuse); + free_percpu(net->core.prot_inuse); + free_percpu(net->core.sock_inuse); } static struct pernet_operations net_inuse_ops = { @@ -3112,6 +3151,10 @@ static inline void assign_proto_idx(struct proto *prot) static inline void release_proto_idx(struct proto *prot) { } + +static void sock_inuse_add(struct net *net, int val) +{ +} #endif static void req_prot_cleanup(struct request_sock_ops *rsk_prot) @@ -3319,7 +3362,6 @@ static int proto_seq_open(struct inode *inode, struct file *file) } static const struct file_operations proto_seq_fops = { - .owner = THIS_MODULE, .open = proto_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5eeb1d20cc38..c5bb52bc73a1 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk, if (prog && skb) sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); - else + + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; } diff --git a/net/core/xdp.c b/net/core/xdp.c new file mode 100644 index 000000000000..097a0f74e004 --- /dev/null +++ b/net/core/xdp.c @@ -0,0 +1,73 @@ +/* net/core/xdp.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ +#include <linux/types.h> +#include <linux/mm.h> + +#include <net/xdp.h> + +#define REG_STATE_NEW 0x0 +#define REG_STATE_REGISTERED 0x1 +#define REG_STATE_UNREGISTERED 0x2 +#define REG_STATE_UNUSED 0x3 + +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +{ + /* Simplify driver cleanup code paths, allow unreg "unused" */ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) + return; + + WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; + xdp_rxq->dev = NULL; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); + +static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) +{ + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +/* Returns 0 on success, negative on failure */ +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index) +{ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { + WARN(1, "Driver promised not to register this"); + return -EINVAL; + } + + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { + WARN(1, "Missing unregister, handled but fix driver"); + xdp_rxq_info_unreg(xdp_rxq); + } + + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + + /* State either UNREGISTERED or NEW */ + xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; + + xdp_rxq->reg_state = REG_STATE_REGISTERED; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); + +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->reg_state = REG_STATE_UNUSED; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); + +bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) +{ + return (xdp_rxq->reg_state == REG_STATE_REGISTERED); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); |