Diffstat (limited to 'net/core/dev.c'):
 net/core/dev.c | 333
 1 file changed, 212 insertions(+), 121 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 945bbd001359..683d493aa1bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -118,6 +118,7 @@
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
+#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
@@ -133,6 +134,7 @@
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
+#include <linux/hrtimer.h>
#include "net-sysfs.h"
@@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close);
*/
void dev_disable_lro(struct net_device *dev)
{
- /*
- * If we're trying to disable lro on a vlan device
- * use the underlying physical device instead
- */
- if (is_vlan_dev(dev))
- dev = vlan_dev_real_dev(dev);
-
- /* the same for macvlan devices */
- if (netif_is_macvlan(dev))
- dev = macvlan_dev_real_dev(dev);
+ struct net_device *lower_dev;
+ struct list_head *iter;
dev->wanted_features &= ~NETIF_F_LRO;
netdev_update_features(dev);
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter)
+ dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
@@ -1697,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
skb_scrub_packet(skb, true);
skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
return 0;
}
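The skb_postpull_rcsum() call added above compensates skb->csum after eth_type_trans() has pulled the Ethernet header, so forwarded CHECKSUM_COMPLETE packets keep a consistent checksum. A conceptual sketch of what such a post-pull adjustment does (paraphrased for illustration, not taken from this patch):

static inline void postpull_rcsum_sketch(struct sk_buff *skb,
					 const void *start, unsigned int len)
{
	/* Bytes that have been pulled out of the data area no longer
	 * belong to the range the hardware checksum covers, so subtract
	 * their contribution from the running csum.
	 */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		skb->csum = csum_sub(skb->csum,
				     csum_partial(start, len, 0));
}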
@@ -2525,12 +2523,12 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
/* If MPLS offload request, verify we are testing hardware MPLS features
* instead of standard features for the netdev.
*/
-#ifdef CONFIG_NET_MPLS_GSO
+#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{
- if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ if (eth_p_mpls(type))
features &= skb->dev->mpls_features;
return features;
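eth_p_mpls() collapses the two explicit ethertype comparisons removed above into one helper from the newly included <net/mpls.h>. A sketch of the check it is expected to perform, inferred from the line it replaces:

static inline bool eth_p_mpls(__be16 eth_type)
{
	/* Treat unicast and multicast MPLS ethertypes alike. */
	return eth_type == htons(ETH_P_MPLS_UC) ||
	       eth_type == htons(ETH_P_MPLS_MC);
}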
@@ -2565,7 +2563,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
- const struct net_device *dev = skb->dev;
+ struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
u16 gso_segs = skb_shinfo(skb)->gso_segs;
__be16 protocol = skb->protocol;
@@ -2573,11 +2571,21 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
features &= ~NETIF_F_GSO_MASK;
- if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
- struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
- protocol = veh->h_vlan_encapsulated_proto;
- } else if (!vlan_tx_tag_present(skb)) {
- return harmonize_features(skb, features);
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (!vlan_tx_tag_present(skb)) {
+ if (unlikely(protocol == htons(ETH_P_8021Q) ||
+ protocol == htons(ETH_P_8021AD))) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ protocol = veh->h_vlan_encapsulated_proto;
+ } else {
+ goto finalize;
+ }
}
features = netdev_intersect_features(features,
@@ -2594,6 +2602,11 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
+finalize:
+ if (dev->netdev_ops->ndo_features_check)
+ features &= dev->netdev_ops->ndo_features_check(skb, dev,
+ features);
+
return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
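With the finalize label above, netif_skb_features() now gives drivers a per-skb veto over offloads through the new ndo_features_check hook before harmonize_features() runs. A hypothetical driver-side sketch of wiring up such a hook (the foo_* names are invented for illustration):

static netdev_features_t foo_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	/* Example policy: hardware that cannot parse encapsulated frames
	 * drops checksum and GSO offloads for them on a per-packet basis.
	 */
	if (skb->encapsulation)
		features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
			      NETIF_F_GSO_MASK);
	return features;
}

static const struct net_device_ops foo_netdev_ops = {
	/* ... the driver's other callbacks ... */
	.ndo_features_check	= foo_features_check,
};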
@@ -2647,12 +2660,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
netdev_features_t features)
{
if (vlan_tx_tag_present(skb) &&
- !vlan_hw_offload_capable(features, skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto,
- vlan_tx_tag_get(skb));
- if (skb)
- skb->vlan_tci = 0;
- }
+ !vlan_hw_offload_capable(features, skb->vlan_proto))
+ skb = __vlan_hwaccel_push_inside(skb);
return skb;
}
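__vlan_hwaccel_push_inside() replaces the open-coded __vlan_put_tag()/vlan_tci sequence removed above with a single helper. A sketch of the equivalent behaviour, reconstructed from the removed lines rather than from if_vlan.h (push_tag_inside_sketch is an invented name):

static struct sk_buff *push_tag_inside_sketch(struct sk_buff *skb)
{
	/* Move the hw-accelerated tag into the frame itself and clear the
	 * accel field; on failure the skb is freed and NULL is returned.
	 */
	skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
	if (skb)
		skb->vlan_tci = 0;
	return skb;
}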
@@ -2668,19 +2677,12 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (unlikely(!skb))
goto out_null;
- /* If encapsulation offload request, verify we are testing
- * hardware encapsulation features instead of standard
- * features for the netdev
- */
- if (skb->encapsulation)
- features &= dev->hw_enc_features;
-
if (netif_needs_gso(dev, skb, features)) {
struct sk_buff *segs;
segs = skb_gso_segment(skb, features);
if (IS_ERR(segs)) {
- segs = NULL;
+ goto out_kfree_skb;
} else if (segs) {
consume_skb(skb);
skb = segs;
@@ -3304,7 +3306,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
rps_lock(sd);
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
- if (skb_queue_len(&sd->input_pkt_queue)) {
+ if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
@@ -4179,7 +4181,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
struct sk_buff *skb = napi->skb;
if (!skb) {
- skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
napi->skb = skb;
}
return skb;
@@ -4316,20 +4318,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
local_irq_enable();
}
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ return sd->rps_ipi_list != NULL;
+#else
+ return false;
+#endif
+}
+
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
-#ifdef CONFIG_RPS
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
- if (sd->rps_ipi_list) {
+ if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
-#endif
+
napi->weight = weight_p;
local_irq_disable();
while (1) {
@@ -4356,7 +4366,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
- list_del(&napi->poll_list);
napi->state = 0;
rps_unlock(sd);
@@ -4376,7 +4385,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
- * The entry's receive function will be scheduled to run
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
void __napi_schedule(struct napi_struct *n)
{
@@ -4388,18 +4398,29 @@ void __napi_schedule(struct napi_struct *n)
}
EXPORT_SYMBOL(__napi_schedule);
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
+
void __napi_complete(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
- BUG_ON(n->gro_list);
- list_del(&n->poll_list);
+ list_del_init(&n->poll_list);
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
-void napi_complete(struct napi_struct *n)
+void napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags;
@@ -4410,12 +4431,28 @@ void napi_complete(struct napi_struct *n)
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
return;
- napi_gro_flush(n, false);
- local_irq_save(flags);
- __napi_complete(n);
- local_irq_restore(flags);
+ if (n->gro_list) {
+ unsigned long timeout = 0;
+
+ if (work_done)
+ timeout = n->dev->gro_flush_timeout;
+
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ else
+ napi_gro_flush(n, false);
+ }
+ if (likely(list_empty(&n->poll_list))) {
+ WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+ } else {
+ /* If n->poll_list is not empty, we need to mask irqs */
+ local_irq_save(flags);
+ __napi_complete(n);
+ local_irq_restore(flags);
+ }
}
-EXPORT_SYMBOL(napi_complete);
+EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
struct napi_struct *napi_by_id(unsigned int napi_id)
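Taken together, __napi_schedule_irqoff(), napi_complete_done() and the gro_flush_timeout hrtimer above change what a typical driver RX path looks like. A hypothetical sketch of an interrupt handler and poll routine using the new entry points (all foo_* names and helpers are invented; this is not part of the patch):

struct foo_priv {				/* hypothetical driver state */
	struct napi_struct napi;
	/* rings, registers, ... */
};

static irqreturn_t foo_isr(int irq, void *data)
{
	struct foo_priv *priv = data;

	/* Hard irqs are already masked here, so the cheaper irqoff
	 * variant can be used instead of __napi_schedule().
	 */
	if (napi_schedule_prep(&priv->napi)) {
		foo_mask_rx_irq(priv);		/* hypothetical register write */
		__napi_schedule_irqoff(&priv->napi);
	}
	return IRQ_HANDLED;
}

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work_done = foo_clean_rx(priv, budget);	/* hypothetical RX clean */

	if (work_done < budget) {
		/* Reporting the actual work count lets the stack arm the
		 * gro_flush_timeout timer instead of flushing GRO
		 * immediately when the device has been busy.
		 */
		napi_complete_done(napi, work_done);
		foo_unmask_rx_irq(priv);	/* hypothetical register write */
	}
	return work_done;
}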
@@ -4469,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi)
}
EXPORT_SYMBOL_GPL(napi_hash_del);
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+ struct napi_struct *napi;
+
+ napi = container_of(timer, struct napi_struct, timer);
+ if (napi->gro_list)
+ napi_schedule(napi);
+
+ return HRTIMER_NORESTART;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
+ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->timer.function = napi_watchdog;
napi->gro_count = 0;
napi->gro_list = NULL;
napi->skb = NULL;
@@ -4491,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
}
EXPORT_SYMBOL(netif_napi_add);
+void napi_disable(struct napi_struct *n)
+{
+ might_sleep();
+ set_bit(NAPI_STATE_DISABLE, &n->state);
+
+ while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+ msleep(1);
+
+ hrtimer_cancel(&n->timer);
+
+ clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
void netif_napi_del(struct napi_struct *napi)
{
list_del_init(&napi->dev_list);
@@ -4502,91 +4566,112 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
-static void net_rx_action(struct softirq_action *h)
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
- unsigned long time_limit = jiffies + 2;
- int budget = netdev_budget;
void *have;
+ int work, weight;
- local_irq_disable();
+ list_del_init(&n->poll_list);
- while (!list_empty(&sd->poll_list)) {
- struct napi_struct *n;
- int work, weight;
+ have = netpoll_poll_lock(n);
- /* If softirq window is exhuasted then punt.
- * Allow this to run for 2 jiffies since which will allow
- * an average latency of 1.5/HZ.
- */
- if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
- goto softnet_break;
+ weight = n->weight;
- local_irq_enable();
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
- /* Even though interrupts have been re-enabled, this
- * access is safe because interrupts can only add new
- * entries to the tail of this list, and only ->poll()
- * calls can remove this head entry from the list.
- */
- n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
+ WARN_ON_ONCE(work > weight);
- have = netpoll_poll_lock(n);
+ if (likely(work < weight))
+ goto out_unlock;
- weight = n->weight;
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ goto out_unlock;
+ }
- /* This NAPI_STATE_SCHED test is for avoiding a race
- * with netpoll's poll_napi(). Only the entity which
- * obtains the lock and sees NAPI_STATE_SCHED set will
- * actually make the ->poll() call. Therefore we avoid
- * accidentally calling ->poll() when NAPI is not scheduled.
+ if (n->gro_list) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
*/
- work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- work = n->poll(n, weight);
- trace_napi_poll(n);
- }
+ napi_gro_flush(n, HZ >= 1000);
+ }
- WARN_ON_ONCE(work > weight);
+ /* Some drivers may have called napi_schedule
+ * prior to exhausting their budget.
+ */
+ if (unlikely(!list_empty(&n->poll_list))) {
+ pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+ n->dev ? n->dev->name : "backlog");
+ goto out_unlock;
+ }
- budget -= work;
+ list_add_tail(&n->poll_list, repoll);
- local_irq_disable();
+out_unlock:
+ netpoll_poll_unlock(have);
- /* Drivers must not modify the NAPI state if they
- * consume the entire weight. In such cases this code
- * still "owns" the NAPI instance and therefore can
- * move the instance around on the list at-will.
- */
- if (unlikely(work == weight)) {
- if (unlikely(napi_disable_pending(n))) {
- local_irq_enable();
- napi_complete(n);
- local_irq_disable();
- } else {
- if (n->gro_list) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- local_irq_enable();
- napi_gro_flush(n, HZ >= 1000);
- local_irq_disable();
- }
- list_move_tail(&n->poll_list, &sd->poll_list);
- }
+ return work;
+}
+
+static void net_rx_action(struct softirq_action *h)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+ unsigned long time_limit = jiffies + 2;
+ int budget = netdev_budget;
+ LIST_HEAD(list);
+ LIST_HEAD(repoll);
+
+ local_irq_disable();
+ list_splice_init(&sd->poll_list, &list);
+ local_irq_enable();
+
+ for (;;) {
+ struct napi_struct *n;
+
+ if (list_empty(&list)) {
+ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
+ return;
+ break;
}
- netpoll_poll_unlock(have);
+ n = list_first_entry(&list, struct napi_struct, poll_list);
+ budget -= napi_poll(n, &repoll);
+
+ /* If softirq window is exhausted then punt.
+ * Allow this to run for 2 jiffies since which will allow
+ * an average latency of 1.5/HZ.
+ */
+ if (unlikely(budget <= 0 ||
+ time_after_eq(jiffies, time_limit))) {
+ sd->time_squeeze++;
+ break;
+ }
}
-out:
- net_rps_action_and_irq_enable(sd);
- return;
+ local_irq_disable();
-softnet_break:
- sd->time_squeeze++;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- goto out;
+ list_splice_tail_init(&sd->poll_list, &list);
+ list_splice_tail(&repoll, &list);
+ list_splice(&list, &sd->poll_list);
+ if (!list_empty(&sd->poll_list))
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+
+ net_rps_action_and_irq_enable(sd);
}
struct netdev_adjacent {
@@ -5786,7 +5871,7 @@ EXPORT_SYMBOL(dev_change_carrier);
* Get device physical port ID
*/
int dev_get_phys_port_id(struct net_device *dev,
- struct netdev_phys_port_id *ppid)
+ struct netdev_phys_item_id *ppid)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -5865,6 +5950,8 @@ static void rollback_registered_many(struct list_head *head)
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
/* Shutdown queueing discipline. */
dev_shutdown(dev);
@@ -5874,6 +5961,11 @@ static void rollback_registered_many(struct list_head *head)
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+ GFP_KERNEL);
+
/*
* Flush the unicast and multicast chains
*/
@@ -5883,9 +5975,8 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
@@ -7200,11 +7291,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
*/
struct net *net;
bool unregistering;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
- prepare_to_wait(&netdev_unregistering_wq, &wait,
- TASK_UNINTERRUPTIBLE);
unregistering = false;
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
@@ -7216,9 +7306,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
if (!unregistering)
break;
__rtnl_unlock();
- schedule();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
- finish_wait(&netdev_unregistering_wq, &wait);
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)