From 0287587884b15041203b3a362d485e1ab1f24445 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 18:38:35 -0700 Subject: net: better IFF_XMIT_DST_RELEASE support Testing xmit_more support with netperf and connected UDP sockets, I found strange dst refcount false sharing. Current handling of IFF_XMIT_DST_RELEASE is not optimal. Dropping dst in validate_xmit_skb() is certainly too late in case packet was queued by cpu X but dequeued by cpu Y The logical point to take care of drop/force is in __dev_queue_xmit() before even taking qdisc lock. As Julian Anastasov pointed out, need for skb_dst() might come from some packet schedulers or classifiers. This patch adds new helper to cleanly express needs of various drivers or qdiscs/classifiers. Drivers that need skb_dst() in their ndo_start_xmit() should call following helper in their setup instead of the prior : dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; -> netif_keep_dst(dev); Instead of using a single bit, we use two bits, one being eventually rebuilt in bonding/team drivers. The other one, is permanent and blocks IFF_XMIT_DST_RELEASE being rebuilt in bonding/team. Eventually, we could add something smarter later. Signed-off-by: Eric Dumazet Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 3 ++- net/atm/clip.c | 2 +- net/core/dev.c | 19 +++++++++---------- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 2 +- net/sched/cls_flow.c | 2 ++ net/sched/cls_route.c | 1 + net/sched/sch_generic.c | 3 --- net/sched/sch_teql.c | 2 +- 14 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 35a6b6b15e8a..0d441ec8763e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; diff --git a/net/atm/clip.c b/net/atm/clip.c index 1d9eaa4f041a..17e55dfecbe2 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev) /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int clip_create(int number) diff --git a/net/core/dev.c b/net/core/dev.c index a63b8c43c1b6..3c5bdaa44486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (skb->next) return skb; - /* If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); - features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); @@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); dev->num_tx_queues = txqs; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0485ef18d254..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e453cb724a95..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ea88ab3102a8..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74b677916a70..de3b1c86b8d3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int ip6gre_tunnel_init(struct net_device *dev) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d3e8888ad611..9409887fb664 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5833a2244467..d440bb585524 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /** diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0d4e27466f82..6eab37cf5345 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a5d2b20db560..4ac515f2a6ce 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, tcf_exts_change(tp, &fnew->exts, &e); tcf_em_tree_change(tp, &fnew->ematches, &t); + netif_keep_dst(qdisc_dev(tp->q)); + if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; fnew->nkeys = nkeys; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f22baae0afa..109a329b7198 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f->handle < f1->handle) break; + netif_keep_dst(qdisc_dev(tp->q)); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2b349a4de3c8..38d58e6cef07 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops); static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - skb_dst_force(skb); q->gso_skb = skb; q->qstats.requeues++; q->q.qlen++; /* it's still part of the queue */ @@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q) if (unlikely(!skb)) return 0; - WARN_ON_ONCE(skb_dst_is_noref(skb)); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 5cd291bd00e4..6ada42396a24 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev) dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static LIST_HEAD(master_dev_list); -- cgit v1.2.3-59-g8ed1b