diff options
Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r-- | drivers/net/tun.c | 291 |
1 files changed, 178 insertions, 113 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1572878c3403..7a3ab3427369 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -209,6 +209,9 @@ struct tun_struct { struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; + /* init args */ + struct file *file; + struct ifreq *ifr; }; struct veth { @@ -216,6 +219,9 @@ struct veth { __be16 h_vlan_TCI; }; +static void tun_flow_init(struct tun_struct *tun); +static void tun_flow_uninit(struct tun_struct *tun); + static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); @@ -262,12 +268,17 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { - netif_tx_napi_add(tun->dev, &tfile->napi, tun_napi_poll, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } +static void tun_napi_enable(struct tun_file *tfile) +{ + if (tfile->napi_enabled) + napi_enable(&tfile->napi); +} + static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) @@ -629,7 +640,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun = rtnl_dereference(tfile->tun); if (tun && clean) { - tun_napi_disable(tfile); + if (!tfile->detached) + tun_napi_disable(tfile); tun_napi_del(tfile); } @@ -648,8 +660,10 @@ static void __tun_detach(struct tun_file *tfile, bool clean) if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); - } else + } else { tun_disable_queue(tun, tfile); + tun_napi_disable(tfile); + } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); @@ -722,6 +736,7 @@ static void tun_detach_all(struct net_device *dev) sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { + tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); @@ -802,6 +817,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, if (tfile->detached) { tun_enable_queue(tfile); + tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); @@ -953,6 +969,49 @@ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) static const struct ethtool_ops tun_ethtool_ops; +static int tun_net_init(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + struct ifreq *ifr = tun->ifr; + int err; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + spin_lock_init(&tun->lock); + + err = security_tun_dev_alloc_security(&tun->security); + if (err < 0) { + free_percpu(dev->tstats); + return err; + } + + tun_flow_init(tun); + + dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | + TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + dev->features = dev->hw_features | NETIF_F_LLTX; + dev->vlan_features = dev->features & + ~(NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); + + INIT_LIST_HEAD(&tun->disabled); + err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, + ifr->ifr_flags & IFF_NAPI_FRAGS, false); + if (err < 0) { + tun_flow_uninit(tun); + security_tun_dev_free_security(tun->security); + free_percpu(dev->tstats); + return err; + } + return 0; +} + /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { @@ -1009,6 +1068,7 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun, static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); + enum skb_drop_reason drop_reason; int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; @@ -1018,8 +1078,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ - if (!tfile) + if (!tfile) { + drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; + } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); @@ -1029,19 +1091,32 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. */ - if (!check_filter(&tun->txflt, skb)) + if (!check_filter(&tun->txflt, skb)) { + drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; + } if (tfile->socket.sk->sk_filter && - sk_filter(tfile->socket.sk, skb)) + sk_filter(tfile->socket.sk, skb)) { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; + } len = run_ebpf_filter(tun, skb, len); - if (len == 0 || pskb_trim(skb, len)) + if (len == 0) { + drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; + } - if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) + if (pskb_trim(skb, len)) { + drop_reason = SKB_DROP_REASON_NOMEM; goto drop; + } + + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; + goto drop; + } skb_tx_timestamp(skb); @@ -1052,12 +1127,14 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset_ct(skb); - if (ptr_ring_produce(&tfile->tx_ring, skb)) + if (ptr_ring_produce(&tfile->tx_ring, skb)) { + drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; + } /* NETIF_F_LLTX requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); - queue->trans_start = jiffies; + txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) @@ -1068,9 +1145,9 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; drop: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } @@ -1169,6 +1246,7 @@ static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) } static const struct net_device_ops tun_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1223,7 +1301,7 @@ resample: void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; @@ -1252,6 +1330,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) } static const struct net_device_ops tap_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1292,7 +1371,7 @@ static void tun_flow_uninit(struct tun_struct *tun) #define MAX_MTU 65535 /* Initialize net device. */ -static void tun_net_init(struct net_device *dev) +static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -1380,7 +1459,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, int err; int i; - if (it->nr_segs > MAX_SKB_FRAGS + 1) + if (it->nr_segs > MAX_SKB_FRAGS + 1 || + len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); @@ -1551,13 +1631,13 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); break; } @@ -1666,6 +1746,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); + enum skb_drop_reason drop_reason; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) @@ -1727,7 +1808,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, */ skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); if (IS_ERR(skb)) { - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); return PTR_ERR(skb); } if (!skb) @@ -1756,7 +1837,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); if (frags) mutex_unlock(&tfile->napi_mutex); return PTR_ERR(skb); @@ -1769,9 +1850,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (err) { err = -EFAULT; + drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; drop: - atomic_long_inc(&tun->dev->rx_dropped); - kfree_skb(skb); + dev_core_stats_rx_dropped_inc(tun->dev); + kfree_skb_reason(skb, drop_reason); if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); @@ -1805,7 +1887,7 @@ drop: pi.proto = htons(ETH_P_IPV6); break; default: - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); kfree_skb(skb); return -EINVAL; } @@ -1818,6 +1900,7 @@ drop: case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; + drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); @@ -1871,6 +1954,7 @@ drop: if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); + drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } @@ -1883,17 +1967,25 @@ drop: skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { - atomic_long_inc(&tun->dev->rx_dropped); + WARN_ON_ONCE(1); + err = -ENOMEM; + dev_core_stats_rx_dropped_inc(tun->dev); +napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); - WARN_ON(1); - return -ENOMEM; + return err; } - local_bh_disable(); - napi_gro_frags(&tfile->napi); - local_bh_enable(); + if (likely(napi_schedule_prep(&tfile->napi))) { + local_bh_disable(); + napi_gro_frags(&tfile->napi); + napi_complete(&tfile->napi); + local_bh_enable(); + } else { + err = -EBUSY; + goto napi_busy; + } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; @@ -1911,7 +2003,7 @@ drop: } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { - netif_rx_ni(skb); + netif_rx(skb); } rcu_read_unlock(); @@ -2206,11 +2298,6 @@ static void tun_free_netdev(struct net_device *dev) BUG_ON(!(list_empty(&tun->disabled))); free_percpu(dev->tstats); - /* We clear tstats so that tun_set_iff() can tell if - * tun_free_netdev() has been called from register_netdevice(). - */ - dev->tstats = NULL; - tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); @@ -2342,9 +2429,10 @@ static int tun_xdp_one(struct tun_struct *tun, struct virtio_net_hdr *gso = &hdr->gso; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; + struct sk_buff_head *queue; u32 rxhash = 0, act; int buflen = hdr->buflen; - int err = 0; + int ret = 0; bool skb_xdp = false; struct page *page; @@ -2359,13 +2447,13 @@ static int tun_xdp_one(struct tun_struct *tun, xdp_set_data_meta_invalid(xdp); act = bpf_prog_run_xdp(xdp_prog, xdp); - err = tun_xdp_act(tun, xdp_prog, xdp, act); - if (err < 0) { + ret = tun_xdp_act(tun, xdp_prog, xdp, act); + if (ret < 0) { put_page(virt_to_head_page(xdp->data)); - return err; + return ret; } - switch (err) { + switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; @@ -2389,7 +2477,7 @@ static int tun_xdp_one(struct tun_struct *tun, build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -2399,7 +2487,7 @@ build: if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); - err = -EINVAL; + ret = -EINVAL; goto out; } @@ -2409,16 +2497,27 @@ build: skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { - err = do_xdp_generic(xdp_prog, skb); - if (err != XDP_PASS) + ret = do_xdp_generic(xdp_prog, skb); + if (ret != XDP_PASS) { + ret = 0; goto out; + } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); - netif_receive_skb(skb); + if (tfile->napi_enabled) { + queue = &tfile->sk.sk_write_queue; + spin_lock(&queue->lock); + __skb_queue_tail(queue, skb); + spin_unlock(&queue->lock); + ret = 1; + } else { + netif_receive_skb(skb); + ret = 0; + } /* No need to disable preemption here since this function is * always called with bh disabled @@ -2429,7 +2528,7 @@ build: tun_flow_update(tun, rxhash, tfile); out: - return err; + return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) @@ -2443,10 +2542,11 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) if (!tun) return -EBADFD; - if (ctl && (ctl->type == TUN_MSG_PTR)) { + if (m->msg_controllen == sizeof(struct tun_msg_ctl) && + ctl && ctl->type == TUN_MSG_PTR) { struct tun_page tpage; int n = ctl->num; - int flush = 0; + int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); @@ -2455,12 +2555,17 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; - tun_xdp_one(tun, tfile, xdp, &flush, &tpage); + ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); + if (ret > 0) + queued += ret; } if (flush) xdp_do_flush(); + if (tfile->napi_enabled && queued > 0) + napi_schedule(&tfile->napi); + rcu_read_unlock(); local_bh_enable(); @@ -2568,7 +2673,7 @@ static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); - return sprintf(buf, "0x%x\n", tun_flags(tun)); + return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, @@ -2576,9 +2681,9 @@ static ssize_t owner_show(struct device *dev, struct device_attribute *attr, { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? - sprintf(buf, "%u\n", - from_kuid_munged(current_user_ns(), tun->owner)): - sprintf(buf, "-1\n"); + sysfs_emit(buf, "%u\n", + from_kuid_munged(current_user_ns(), tun->owner)) : + sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, @@ -2586,9 +2691,9 @@ static ssize_t group_show(struct device *dev, struct device_attribute *attr, { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? - sprintf(buf, "%u\n", - from_kgid_munged(current_user_ns(), tun->group)): - sprintf(buf, "-1\n"); + sysfs_emit(buf, "%u\n", + from_kgid_munged(current_user_ns(), tun->group)) : + sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); @@ -2716,48 +2821,26 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) { - err = -ENOMEM; - goto err_free_dev; - } + tun->ifr = ifr; + tun->file = file; - spin_lock_init(&tun->lock); - - err = security_tun_dev_alloc_security(&tun->security); - if (err < 0) - goto err_free_stat; - - tun_net_init(dev); - tun_flow_init(tun); - - dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | - TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; - dev->features = dev->hw_features | NETIF_F_LLTX; - dev->vlan_features = dev->features & - ~(NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); - - tun->flags = (tun->flags & ~TUN_FEATURES) | - (ifr->ifr_flags & TUN_FEATURES); - - INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI, - ifr->ifr_flags & IFF_NAPI_FRAGS, false); - if (err < 0) - goto err_free_flow; + tun_net_initialize(dev); err = register_netdevice(tun->dev); - if (err < 0) - goto err_detach; + if (err < 0) { + free_netdev(dev); + return err; + } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } - netif_carrier_on(tun->dev); + if (ifr->ifr_flags & IFF_NO_CARRIER) + netif_carrier_off(tun->dev); + else + netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. @@ -2767,24 +2850,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) strcpy(ifr->ifr_name, tun->dev->name); return 0; - -err_detach: - tun_detach_all(dev); - /* We are here because register_netdevice() has failed. - * If register_netdevice() already called tun_free_netdev() - * while dealing with the error, dev->stats has been cleared. - */ - if (!dev->tstats) - goto err_free_dev; - -err_free_flow: - tun_flow_uninit(tun); - security_tun_dev_free_security(tun->security); -err_free_stat: - free_percpu(dev->tstats); -err_free_dev: - free_netdev(dev); - return err; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) @@ -3003,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, * This is needed because we never checked for invalid flags on * TUNSETIFF. */ - return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES, - (unsigned int __user*)argp); + return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | + TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { @@ -3487,15 +3552,15 @@ static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info { struct tun_struct *tun = netdev_priv(dev); - strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); - strlcpy(info->version, DRV_VERSION, sizeof(info->version)); + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); + strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: - strlcpy(info->bus_info, "tun", sizeof(info->bus_info)); + strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: - strlcpy(info->bus_info, "tap", sizeof(info->bus_info)); + strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } |