about | summary | refs | log | tree | commit | diff | stats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/Makefile 1
-rw-r--r-- net/core/dev.c 487
-rw-r--r-- net/core/dev_ioctl.c 7
-rw-r--r-- net/core/ethtool.c 82
-rw-r--r-- net/core/filter.c 45
-rw-r--r-- net/core/flow_dissector.c 115
-rw-r--r-- net/core/gen_estimator.c 29
-rw-r--r-- net/core/gen_stats.c 112
-rw-r--r-- net/core/net_namespace.c 2
-rw-r--r-- net/core/netpoll.c 7
-rw-r--r-- net/core/pktgen.c 76
-rw-r--r-- net/core/rtnetlink.c 66
-rw-r--r-- net/core/secure_seq.c 6
-rw-r--r-- net/core/skbuff.c 398
-rw-r--r-- net/core/sock.c 118
-rw-r--r-- net/core/timestamping.c 43
-rw-r--r-- net/core/user_dma.c 131
-rw-r--r-- net/core/utils.c 12
18 files changed, 984 insertions, 753 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 71093d94ad2b..235e6c50708d 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,7 +16,6 @@ obj-y += net-sysfs.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
-obj-$(CONFIG_NET_DMA) += user_dma.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
diff --git a/net/core/dev.c b/net/core/dev.c
index ab9a16530c36..4699dcfdc4ab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -897,23 +897,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
- * dev_get_by_flags_rcu - find any device with given flags
+ * __dev_get_by_flags - find any device with given flags
* @net: the applicable net namespace
* @if_flags: IFF_* values
* @mask: bitmask of bits in if_flags to check
*
* Search for any interface with the given flags. Returns NULL if a device
* is not found or a pointer to the device. Must be called inside
- * rcu_read_lock(), and result refcount is unchanged.
+ * rtnl_lock(), and result refcount is unchanged.
*/
-struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
- unsigned short mask)
+struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
+ unsigned short mask)
{
struct net_device *dev, *ret;
+ ASSERT_RTNL();
+
ret = NULL;
- for_each_netdev_rcu(net, dev) {
+ for_each_netdev(net, dev) {
if (((dev->flags ^ if_flags) & mask) == 0) {
ret = dev;
break;
@@ -921,7 +923,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags
}
return ret;
}
-EXPORT_SYMBOL(dev_get_by_flags_rcu);
+EXPORT_SYMBOL(__dev_get_by_flags);
/**
* dev_valid_name - check if name is okay for network device
@@ -1284,7 +1286,6 @@ static int __dev_open(struct net_device *dev)
clear_bit(__LINK_STATE_START, &dev->state);
else {
dev->flags |= IFF_UP;
- net_dmaengine_get();
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1363,7 +1364,6 @@ static int __dev_close_many(struct list_head *head)
ops->ndo_stop(dev);
dev->flags &= ~IFF_UP;
- net_dmaengine_put();
netpoll_poll_enable(dev);
}
@@ -2177,6 +2177,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
return (struct dev_kfree_skb_cb *)skb->cb;
}
+void netif_schedule_queue(struct netdev_queue *txq)
+{ /* Schedule the qdisc attached to @txq via __netif_schedule(), but only
+ * when no QUEUE_STATE_ANY_XOFF bit is set in txq->state.
+ */
+ rcu_read_lock(); /* read-side section for the rcu_dereference() below */
+ if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+ __netif_schedule(q);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netif_schedule_queue);
+
+/**
+ * netif_wake_subqueue - allow sending packets on subqueue
+ * @dev: network device
+ * @queue_index: sub queue index
+ *
+ * Resume individual transmit queue of a device with multiple transmit queues.
+ * The queue's qdisc is handed to __netif_schedule() only when this call is
+ * the one that clears __QUEUE_STATE_DRV_XOFF; if the bit was already clear
+ * nothing is done.
+ */
+void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
+
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock(); /* read-side section for the rcu_dereference() below */
+ q = rcu_dereference(txq->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_wake_subqueue);
+
+void netif_tx_wake_queue(struct netdev_queue *dev_queue)
+{ /* Clear __QUEUE_STATE_DRV_XOFF on @dev_queue and, if the bit had been set,
+ * schedule the queue's qdisc with __netif_schedule().
+ */
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock(); /* read-side section for the rcu_dereference() below */
+ q = rcu_dereference(dev_queue->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_tx_wake_queue);
+
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
unsigned long flags;
@@ -2373,16 +2420,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- int err;
-
- err = ptype->callbacks.gso_send_check(skb);
- segs = ERR_PTR(err);
- if (err || skb_gso_ok(skb, features))
- break;
- __skb_push(skb, (skb->data -
- skb_network_header(skb)));
- }
segs = ptype->callbacks.gso_segment(skb, features);
break;
}
@@ -2485,52 +2522,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
return 0;
}
-struct dev_gso_cb {
- void (*destructor)(struct sk_buff *skb);
-};
-
-#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
-
-static void dev_gso_skb_destructor(struct sk_buff *skb)
-{
- struct dev_gso_cb *cb;
-
- kfree_skb_list(skb->next);
- skb->next = NULL;
-
- cb = DEV_GSO_CB(skb);
- if (cb->destructor)
- cb->destructor(skb);
-}
-
-/**
- * dev_gso_segment - Perform emulated hardware segmentation on skb.
- * @skb: buffer to segment
- * @features: device features as applicable to this skb
- *
- * This function segments the given skb and stores the list of segments
- * in skb->next.
- */
-static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
-{
- struct sk_buff *segs;
-
- segs = skb_gso_segment(skb, features);
-
- /* Verifying header integrity only. */
- if (!segs)
- return 0;
-
- if (IS_ERR(segs))
- return PTR_ERR(segs);
-
- skb->next = segs;
- DEV_GSO_CB(skb)->destructor = skb->destructor;
- skb->destructor = dev_gso_skb_destructor;
-
- return 0;
-}
-
/* If MPLS offload request, verify we are testing hardware MPLS features
* instead of standard features for the netdev.
*/
@@ -2574,10 +2565,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
+ const struct net_device *dev = skb->dev;
+ netdev_features_t features = dev->features;
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
__be16 protocol = skb->protocol;
- netdev_features_t features = skb->dev->features;
- if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+ if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
features &= ~NETIF_F_GSO_MASK;
if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
@@ -2588,7 +2581,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
}
features = netdev_intersect_features(features,
- skb->dev->vlan_features |
+ dev->vlan_features |
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
@@ -2605,119 +2598,149 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_skb_features);
-int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
- struct netdev_queue *txq)
+static int xmit_one(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq, bool more) /* send exactly one skb; returns the netdev_start_xmit() status */
{
- const struct net_device_ops *ops = dev->netdev_ops;
- int rc = NETDEV_TX_OK;
- unsigned int skb_len;
-
- if (likely(!skb->next)) {
- netdev_features_t features;
+ unsigned int len; /* skb->len sampled before the skb is passed on */
+ int rc;
- /*
- * If device doesn't need skb->dst, release it right now while
- * its hot in this cpu cache
- */
- if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
- skb_dst_drop(skb);
+ if (!list_empty(&ptype_all)) /* ptype_all has entries: show them the skb first */
+ dev_queue_xmit_nit(skb, dev);
- features = netif_skb_features(skb);
+ len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
+ rc = netdev_start_xmit(skb, dev, txq, more); /* @more is forwarded unchanged */
+ trace_net_dev_xmit(skb, rc, dev, len); /* traces the length saved above, not skb->len */
- if (vlan_tx_tag_present(skb) &&
- !vlan_hw_offload_capable(features, skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto,
- vlan_tx_tag_get(skb));
- if (unlikely(!skb))
- goto out;
+ return rc;
+}
- skb->vlan_tci = 0;
- }
+struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
+ struct netdev_queue *txq, int *ret)
+{ /* Transmit the ->next linked skb list starting at @first, one skb at a
+ * time through xmit_one(). Returns the first skb that was not sent
+ * (NULL when the whole list went out) and stores the last status in
+ * *@ret; for an empty list that status is NETDEV_TX_OK.
+ */
+ struct sk_buff *skb = first;
+ int rc = NETDEV_TX_OK;
- /* If encapsulation offload request, verify we are testing
- * hardware encapsulation features instead of standard
- * features for the netdev
- */
- if (skb->encapsulation)
- features &= dev->hw_enc_features;
+ while (skb) {
+ struct sk_buff *next = skb->next;
- if (netif_needs_gso(skb, features)) {
- if (unlikely(dev_gso_segment(skb, features)))
- goto out_kfree_skb;
- if (skb->next)
- goto gso;
- } else {
- if (skb_needs_linearize(skb, features) &&
- __skb_linearize(skb))
- goto out_kfree_skb;
+ skb->next = NULL; /* detach: xmit_one() is given a single skb */
+ rc = xmit_one(skb, dev, txq, next != NULL); /* more == true while further skbs remain */
+ if (unlikely(!dev_xmit_complete(rc))) {
+ skb->next = next; /* re-link so the unsent remainder is returned as one list */
+ goto out;
+ }
- /* If packet is not checksummed and device does not
- * support checksumming for this protocol, complete
- * checksumming here.
- */
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- if (skb->encapsulation)
- skb_set_inner_transport_header(skb,
- skb_checksum_start_offset(skb));
- else
- skb_set_transport_header(skb,
- skb_checksum_start_offset(skb));
- if (!(features & NETIF_F_ALL_CSUM) &&
- skb_checksum_help(skb))
- goto out_kfree_skb;
- }
+ skb = next;
+ if (netif_xmit_stopped(txq) && skb) { /* queue stopped with skbs still pending */
+ rc = NETDEV_TX_BUSY;
+ break;
}
+ }
- if (!list_empty(&ptype_all))
- dev_queue_xmit_nit(skb, dev);
+out:
+ *ret = rc;
+ return skb;
+}
- skb_len = skb->len;
- trace_net_dev_start_xmit(skb, dev);
- rc = ops->ndo_start_xmit(skb, dev);
- trace_net_dev_xmit(skb, rc, dev, skb_len);
- if (rc == NETDEV_TX_OK)
- txq_trans_update(txq);
- return rc;
+static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
+ netdev_features_t features)
+{ /* If @skb carries a VLAN tag that @features cannot offload for
+ * skb->vlan_proto, insert the tag with __vlan_put_tag() and clear
+ * skb->vlan_tci. Returns the resulting skb, which is NULL when
+ * __vlan_put_tag() returned NULL.
+ */
+ if (vlan_tx_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (skb)
+ skb->vlan_tci = 0;
}
+ return skb;
+}
-gso:
- do {
- struct sk_buff *nskb = skb->next;
+/* validate_xmit_skb - make one skb acceptable to @dev before transmission.
+ *
+ * An skb that already has ->next set is returned untouched. Otherwise the
+ * VLAN tag is inserted in software when needed (validate_xmit_vlan()), and
+ * the skb is either GSO-segmented in software or linearized/checksummed,
+ * depending on the features reported by netif_skb_features().
+ *
+ * Returns the skb to transmit (the head of the segment list when software
+ * GSO produced one), or NULL when the skb was dropped. On the out_kfree_skb
+ * path the skb is freed here with kfree_skb().
+ */
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
+{
+ netdev_features_t features;
- skb->next = nskb->next;
- nskb->next = NULL;
+ if (skb->next)
+ return skb;
- if (!list_empty(&ptype_all))
- dev_queue_xmit_nit(nskb, dev);
-
- skb_len = nskb->len;
- trace_net_dev_start_xmit(nskb, dev);
- rc = ops->ndo_start_xmit(nskb, dev);
- trace_net_dev_xmit(nskb, rc, dev, skb_len);
- if (unlikely(rc != NETDEV_TX_OK)) {
- if (rc & ~NETDEV_TX_MASK)
- goto out_kfree_gso_skb;
- nskb->next = skb->next;
- skb->next = nskb;
- return rc;
+ features = netif_skb_features(skb);
+ skb = validate_xmit_vlan(skb, features);
+ if (unlikely(!skb))
+ goto out_null;
+
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (netif_needs_gso(skb, features)) {
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, features);
+ if (IS_ERR(segs)) {
+ /* Software segmentation failed: drop the skb instead of
+ * passing an unsegmented GSO skb on towards the device.
+ */
+ goto out_kfree_skb;
+ } else if (segs) {
+ /* Segment list replaces the original skb; a NULL segs
+ * leaves skb unchanged.
+ */
+ consume_skb(skb);
+ skb = segs;
}
- txq_trans_update(txq);
- if (unlikely(netif_xmit_stopped(txq) && skb->next))
- return NETDEV_TX_BUSY;
- } while (skb->next);
+ } else {
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
-out_kfree_gso_skb:
- if (likely(skb->next == NULL)) {
- skb->destructor = DEV_GSO_CB(skb)->destructor;
- consume_skb(skb);
- return rc;
+ /* If packet is not checksummed and device does not
+ * support checksumming for this protocol, complete
+ * checksumming here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb->encapsulation)
+ skb_set_inner_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ else
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (!(features & NETIF_F_ALL_CSUM) &&
+ skb_checksum_help(skb))
+ goto out_kfree_skb;
+ }
}
+
+ return skb;
+
out_kfree_skb:
kfree_skb(skb);
-out:
- return rc;
+out_null:
+ return NULL;
+}
+
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
+{ /* Run validate_xmit_skb() on every skb of the ->next linked list @skb and
+ * chain the results together. Returns the head of the new list, or NULL
+ * if validate_xmit_skb() returned NULL for every entry.
+ */
+ struct sk_buff *next, *head = NULL, *tail;
+
+ for (; skb != NULL; skb = next) {
+ next = skb->next; /* saved first: skb is unlinked and may be replaced below */
+ skb->next = NULL;
+
+ /* in case skb wont be segmented, point to itself */
+ skb->prev = skb;
+
+ skb = validate_xmit_skb(skb, dev);
+ if (!skb)
+ continue; /* nothing returned for this entry: leave it out of the result */
+
+ if (!head)
+ head = skb;
+ else
+ tail->next = skb;
+ /* If skb was segmented, skb->prev points to
+ * the last segment. If not, it still contains skb.
+ * NOTE(review): this relies on the segmentation code setting
+ * ->prev of the first segment; confirm against skb_gso_segment().
+ */
+ tail = skb->prev;
+ }
+ return head;
+}
-EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
@@ -2780,12 +2803,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
- if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
- skb_dst_force(skb);
qdisc_bstats_update(q, skb);
- if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
@@ -2796,7 +2817,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
- skb_dst_force(skb);
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
@@ -2893,6 +2913,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb_update_prio(skb);
+ /* If device/qdisc don't need skb->dst, release it right now while
+ * its hot in this cpu cache.
+ */
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -2925,11 +2953,15 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
goto recursion_alert;
+ skb = validate_xmit_skb(skb, dev);
+ if (!skb)
+ goto drop;
+
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
- rc = dev_hard_start_xmit(skb, dev, txq);
+ skb = dev_hard_start_xmit(skb, dev, txq, &rc);
__this_cpu_dec(xmit_recursion);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
@@ -2950,10 +2982,11 @@ recursion_alert:
}
rc = -ENETDOWN;
+drop:
rcu_read_unlock_bh();
atomic_long_inc(&dev->tx_dropped);
- kfree_skb(skb);
+ kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
@@ -3130,8 +3163,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
if (map) {
- tcpu = map->cpus[((u64) hash * map->len) >> 32];
-
+ tcpu = map->cpus[reciprocal_scale(hash, map->len)];
if (cpu_online(tcpu)) {
cpu = tcpu;
goto done;
@@ -3467,7 +3499,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
- q = rxq->qdisc;
+ q = rcu_dereference(rxq->qdisc);
if (q != &noop_qdisc) {
spin_lock(qdisc_lock(q));
if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
@@ -3484,7 +3516,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
{
struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
- if (!rxq || rxq->qdisc == &noop_qdisc)
+ if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
goto out;
if (*pt_prev) {
@@ -3965,11 +3997,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
- if (skb_is_gso(skb) || skb_has_frag_list(skb))
+ if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
goto normal;
gro_list_prepare(napi, skb);
- NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -3983,6 +4014,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->udp_mark = 0;
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ break;
+ default:
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ }
+
pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
break;
}
@@ -4212,6 +4259,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
}
EXPORT_SYMBOL(napi_gro_frags);
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ *
+ * The sum covers skb_gro_len(skb) bytes starting at skb_gro_offset(skb).
+ * As a side effect NAPI_GRO_CB(skb)->csum is overwritten with the newly
+ * computed, unfolded sum and NAPI_GRO_CB(skb)->csum_valid is set to 1,
+ * whatever the result.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ if (likely(!sum)) { /* folded result is zero */
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev); /* ip_summed was CHECKSUM_COMPLETE and csum_complete_sw is clear */
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
@@ -4505,14 +4577,6 @@ static void net_rx_action(struct softirq_action *h)
out:
net_rps_action_and_irq_enable(sd);
-#ifdef CONFIG_NET_DMA
- /*
- * There may not be any more sk_buffs coming right now, so push
- * any pending DMA copies to hardware
- */
- dma_issue_pending_all();
-#endif
-
return;
softnet_break:
@@ -4809,9 +4873,14 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
sysfs_remove_link(&(dev->dev.kobj), linkname);
}
-#define netdev_adjacent_is_neigh_list(dev, dev_list) \
- (dev_list == &dev->adj_list.upper || \
- dev_list == &dev->adj_list.lower)
+static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{ /* True when @dev_list is @dev's adj_list.upper or adj_list.lower and
+ * dev_net() of @dev and @adj_dev compare equal under net_eq().
+ */
+ return (dev_list == &dev->adj_list.upper ||
+ dev_list == &dev->adj_list.lower) &&
+ net_eq(dev_net(dev), dev_net(adj_dev));
+}
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
@@ -4841,7 +4910,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
pr_debug("dev_hold for %s, because of link added from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
- if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
if (ret)
goto free_adj;
@@ -4862,7 +4931,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
return 0;
remove_symlinks:
- if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
kfree(adj);
@@ -4895,8 +4964,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
if (adj->master)
sysfs_remove_link(&(dev->dev.kobj), "master");
- if (netdev_adjacent_is_neigh_list(dev, dev_list) &&
- net_eq(dev_net(dev),dev_net(adj_dev)))
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
@@ -6585,6 +6653,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->gso_min_segs = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
@@ -6594,7 +6663,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower);
- dev->priv_flags = IFF_XMIT_DST_RELEASE;
+ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
dev->num_tx_queues = txqs;
@@ -7016,53 +7085,45 @@ const char *netdev_drivername(const struct net_device *dev)
return empty;
}
-static int __netdev_printk(const char *level, const struct net_device *dev,
- struct va_format *vaf)
+static void __netdev_printk(const char *level, const struct net_device *dev,
+ struct va_format *vaf)
{
- int r;
-
if (dev && dev->dev.parent) {
- r = dev_printk_emit(level[1] - '0',
- dev->dev.parent,
- "%s %s %s%s: %pV",
- dev_driver_string(dev->dev.parent),
- dev_name(dev->dev.parent),
- netdev_name(dev), netdev_reg_state(dev),
- vaf);
+ dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s%s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), netdev_reg_state(dev),
+ vaf);
} else if (dev) {
- r = printk("%s%s%s: %pV", level, netdev_name(dev),
- netdev_reg_state(dev), vaf);
+ printk("%s%s%s: %pV",
+ level, netdev_name(dev), netdev_reg_state(dev), vaf);
} else {
- r = printk("%s(NULL net_device): %pV", level, vaf);
+ printk("%s(NULL net_device): %pV", level, vaf);
}
-
- return r;
}
-int netdev_printk(const char *level, const struct net_device *dev,
- const char *format, ...)
+void netdev_printk(const char *level, const struct net_device *dev,
+ const char *format, ...)
{
struct va_format vaf;
va_list args;
- int r;
va_start(args, format);
vaf.fmt = format;
vaf.va = &args;
- r = __netdev_printk(level, dev, &vaf);
+ __netdev_printk(level, dev, &vaf);
va_end(args);
-
- return r;
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level) \
-int func(const struct net_device *dev, const char *fmt, ...) \
+void func(const struct net_device *dev, const char *fmt, ...) \
{ \
- int r; \
struct va_format vaf; \
va_list args; \
\
@@ -7071,11 +7132,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \
vaf.fmt = fmt; \
vaf.va = &args; \
\
- r = __netdev_printk(level, dev, &vaf); \
+ __netdev_printk(level, dev, &vaf); \
\
va_end(args); \
- \
- return r; \
} \
EXPORT_SYMBOL(func);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index cf999e09bcd2..72e899a3efda 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -365,11 +365,8 @@ void dev_load(struct net *net, const char *name)
no_module = !dev;
if (no_module && capable(CAP_NET_ADMIN))
no_module = request_module("netdev-%s", name);
- if (no_module && capable(CAP_SYS_MODULE)) {
- if (!request_module("%s", name))
- pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
- name);
- }
+ if (no_module && capable(CAP_SYS_MODULE))
+ request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 17cb912793fa..1600aa24d36b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1621,6 +1621,81 @@ static int ethtool_get_module_eeprom(struct net_device *dev,
modinfo.eeprom_len);
}
+static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
+{ /* Check a tunable request header: returns 0 when the id is known and its
+ * len/type_id match what that id requires, -EINVAL otherwise.
+ */
+ switch (tuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK:
+ if (tuna->len != sizeof(u32) || /* both copybreak ids take exactly one u32 */
+ tuna->type_id != ETHTOOL_TUNABLE_U32)
+ return -EINVAL;
+ break;
+ default: /* any other id is rejected */
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
+{ /* Read a struct ethtool_tunable header from @useraddr, validate it, ask
+ * ops->get_tunable() for the value and copy tuna.len bytes of it back to
+ * user memory right after the header. Returns 0 or a negative errno.
+ */
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->get_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER); /* tuna.len was checked by ethtool_tunable_valid() above */
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_tunable(dev, &tuna, data);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna); /* the value lives directly behind the header */
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)
+{ /* Read a struct ethtool_tunable header from @useraddr, validate it, copy
+ * the tuna.len bytes that follow the header from user memory and pass
+ * them to ops->set_tunable(). Returns that callback's result or a
+ * negative errno from the earlier steps.
+ */
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->set_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER); /* tuna.len was checked by ethtool_tunable_valid() above */
+ if (!data)
+ return -ENOMEM;
+ useraddr += sizeof(tuna); /* the value lives directly behind the header */
+ ret = -EFAULT;
+ if (copy_from_user(data, useraddr, tuna.len))
+ goto out;
+ ret = ops->set_tunable(dev, &tuna, data);
+
+out:
+ kfree(data);
+ return ret;
+}
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -1670,6 +1745,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GCHANNELS:
case ETHTOOL_GET_TS_INFO:
case ETHTOOL_GEEE:
+ case ETHTOOL_GTUNABLE:
break;
default:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1857,6 +1933,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GMODULEEEPROM:
rc = ethtool_get_module_eeprom(dev, useraddr);
break;
+ case ETHTOOL_GTUNABLE:
+ rc = ethtool_get_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_STUNABLE:
+ rc = ethtool_set_tunable(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index d814b8a89d0f..fcd3f6742a6a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -87,33 +87,9 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sk_filter);
-/* Helper to find the offset of pkt_type in sk_buff structure. We want
- * to make sure its still a 3bit field starting at a byte boundary;
- * taken from arch/x86/net/bpf_jit_comp.c.
- */
-#ifdef __BIG_ENDIAN_BITFIELD
-#define PKT_TYPE_MAX (7 << 5)
-#else
-#define PKT_TYPE_MAX 7
-#endif
-static unsigned int pkt_type_offset(void)
-{
- struct sk_buff skb_probe = { .pkt_type = ~0, };
- u8 *ct = (u8 *) &skb_probe;
- unsigned int off;
-
- for (off = 0; off < sizeof(struct sk_buff); off++) {
- if (ct[off] == PKT_TYPE_MAX)
- return off;
- }
-
- pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
- return -1;
-}
-
static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
- return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+ return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
}
static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
@@ -190,11 +166,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_PKTTYPE:
- *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
- pkt_type_offset());
- if (insn->off < 0)
- return false;
- insn++;
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
+ PKT_TYPE_OFFSET());
*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
insn++;
@@ -933,7 +906,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
/* Expand fp for appending the new filter representation. */
old_fp = fp;
- fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL);
+ fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
if (!fp) {
/* The old_fp is still around in case we couldn't
* allocate new memory, so uncharge on that one.
@@ -972,7 +945,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
int err;
fp->bpf_func = NULL;
- fp->jited = 0;
+ fp->jited = false;
err = bpf_check_classic(fp->insns, fp->len);
if (err) {
@@ -1013,7 +986,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
if (fprog->filter == NULL)
return -EINVAL;
- fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL);
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
if (!fp)
return -ENOMEM;
@@ -1069,12 +1042,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (fprog->filter == NULL)
return -EINVAL;
- prog = kmalloc(bpf_fsize, GFP_KERNEL);
+ prog = bpf_prog_alloc(bpf_fsize, 0);
if (!prog)
return -ENOMEM;
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
- kfree(prog);
+ __bpf_prog_free(prog);
return -EFAULT;
}
@@ -1082,7 +1055,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
- kfree(prog);
+ __bpf_prog_free(prog);
return -ENOMEM;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5f362c1d0332..8560dea58803 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -13,6 +13,7 @@
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <net/flow_keys.h>
+#include <scsi/fc/fc_fcoe.h>
/* copy saddr & daddr, possibly using 64bit load/store
* Equivalent to : flow->src = iph->saddr;
@@ -26,36 +27,61 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i
}
/**
- * skb_flow_get_ports - extract the upper layer ports and return them
- * @skb: buffer to extract the ports from
+ * __skb_flow_get_ports - extract the upper layer ports and return them
+ * @skb: sk_buff to extract the ports from
* @thoff: transport header offset
* @ip_proto: protocol for which to get port offset
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
*
* The function will try to retrieve the ports at offset thoff + poff where poff
* is the protocol port offset returned from proto_ports_offset
*/
-__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
+ if (!data) {
+ data = skb->data;
+ hlen = skb_headlen(skb);
+ }
+
if (poff >= 0) {
__be32 *ports, _ports;
- ports = skb_header_pointer(skb, thoff + poff,
- sizeof(_ports), &_ports);
+ ports = __skb_header_pointer(skb, thoff + poff,
+ sizeof(_ports), data, hlen, &_ports);
if (ports)
return *ports;
}
return 0;
}
-EXPORT_SYMBOL(skb_flow_get_ports);
+EXPORT_SYMBOL(__skb_flow_get_ports);
-bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
+/**
+ * __skb_flow_dissect - extract the flow_keys struct and return it
+ * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
+ * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ *
+ * The function will try to retrieve the struct flow_keys from either the skbuff
+ * or a raw buffer specified by the rest parameters
+ */
+bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+ void *data, __be16 proto, int nhoff, int hlen)
{
- int nhoff = skb_network_offset(skb);
u8 ip_proto;
- __be16 proto = skb->protocol;
+
+ if (!data) {
+ data = skb->data;
+ proto = skb->protocol;
+ nhoff = skb_network_offset(skb);
+ hlen = skb_headlen(skb);
+ }
memset(flow, 0, sizeof(*flow));
@@ -65,7 +91,7 @@ again:
const struct iphdr *iph;
struct iphdr _iph;
ip:
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph || iph->ihl < 5)
return false;
nhoff += iph->ihl * 4;
@@ -83,7 +109,7 @@ ip:
__be32 flow_label;
ipv6:
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph)
return false;
@@ -92,6 +118,13 @@ ipv6:
flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
nhoff += sizeof(struct ipv6hdr);
+ /* skip the flow label processing if skb is NULL. The
+ * assumption here is that if there is no skb we are not
+ * looking for flow info as much as we are length.
+ */
+ if (!skb)
+ break;
+
flow_label = ip6_flowlabel(iph);
if (flow_label) {
/* Awesome, IPv6 packet has a flow label so we can
@@ -113,7 +146,7 @@ ipv6:
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
- vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
+ vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
if (!vlan)
return false;
@@ -126,7 +159,7 @@ ipv6:
struct pppoe_hdr hdr;
__be16 proto;
} *hdr, _hdr;
- hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
return false;
proto = hdr->proto;
@@ -140,6 +173,9 @@ ipv6:
return false;
}
}
+ case htons(ETH_P_FCOE):
+ flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+ /* fall through */
default:
return false;
}
@@ -151,7 +187,7 @@ ipv6:
__be16 proto;
} *hdr, _hdr;
- hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
return false;
/*
@@ -171,8 +207,9 @@ ipv6:
const struct ethhdr *eth;
struct ethhdr _eth;
- eth = skb_header_pointer(skb, nhoff,
- sizeof(_eth), &_eth);
+ eth = __skb_header_pointer(skb, nhoff,
+ sizeof(_eth),
+ data, hlen, &_eth);
if (!eth)
return false;
proto = eth->h_proto;
@@ -194,12 +231,12 @@ ipv6:
flow->n_proto = proto;
flow->ip_proto = ip_proto;
- flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
+ flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
flow->thoff = (u16) nhoff;
return true;
}
-EXPORT_SYMBOL(skb_flow_dissect);
+EXPORT_SYMBOL(__skb_flow_dissect);
static u32 hashrnd __read_mostly;
static __always_inline void __flow_hash_secret_init(void)
@@ -286,30 +323,22 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
qcount = dev->tc_to_txq[tc].count;
}
- return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset;
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
-/* __skb_get_poff() returns the offset to the payload as far as it could
- * be dissected. The main user is currently BPF, so that we can dynamically
- * truncate packets without needing to push actual payload to the user
- * space and can analyze headers only, instead.
- */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+ const struct flow_keys *keys, int hlen)
{
- struct flow_keys keys;
- u32 poff = 0;
-
- if (!skb_flow_dissect(skb, &keys))
- return 0;
+ u32 poff = keys->thoff;
- poff += keys.thoff;
- switch (keys.ip_proto) {
+ switch (keys->ip_proto) {
case IPPROTO_TCP: {
const struct tcphdr *tcph;
struct tcphdr _tcph;
- tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph);
+ tcph = __skb_header_pointer(skb, poff, sizeof(_tcph),
+ data, hlen, &_tcph);
if (!tcph)
return poff;
@@ -343,6 +372,21 @@ u32 __skb_get_poff(const struct sk_buff *skb)
return poff;
}
+/* skb_get_poff() - offset of the payload, as far as the headers could be
+ * dissected. BPF is currently the main user: it allows packets to be
+ * truncated dynamically so that only the headers, not the actual payload,
+ * have to be pushed to user space for analysis.
+ *
+ * Returns 0 when skb_flow_dissect() fails.
+ */
+u32 skb_get_poff(const struct sk_buff *skb)
+{
+	struct flow_keys keys;
+	bool dissected = skb_flow_dissect(skb, &keys);
+
+	return dissected ?
+		__skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)) : 0;
+}
+
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
@@ -359,9 +403,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
if (map->len == 1)
queue_index = map->queues[0];
else
- queue_index = map->queues[
- ((u64)skb_get_hash(skb) * map->len) >> 32];
-
+ queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+ map->len)];
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9d33dfffca19..9dfb88a933e7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -91,6 +91,8 @@ struct gen_estimator
u32 avpps;
struct rcu_head e_rcu;
struct rb_node node;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ struct rcu_head head;
};
struct gen_estimator_head
@@ -115,9 +117,8 @@ static void est_timer(unsigned long arg)
rcu_read_lock();
list_for_each_entry_rcu(e, &elist[idx].list, list) {
- u64 nbytes;
+ struct gnet_stats_basic_packed b = {0};
u64 brate;
- u32 npackets;
u32 rate;
spin_lock(e->stats_lock);
@@ -125,15 +126,15 @@ static void est_timer(unsigned long arg)
if (e->bstats == NULL)
goto skip;
- nbytes = e->bstats->bytes;
- npackets = e->bstats->packets;
- brate = (nbytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = nbytes;
+ __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+
+ brate = (b.bytes - e->last_bytes)<<(7 - idx);
+ e->last_bytes = b.bytes;
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
- rate = (npackets - e->last_packets)<<(12 - idx);
- e->last_packets = npackets;
+ rate = (b.packets - e->last_packets)<<(12 - idx);
+ e->last_packets = b.packets;
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
e->rate_est->pps = (e->avpps+0x1FF)>>10;
skip:
@@ -203,12 +204,14 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock,
struct nlattr *opt)
{
struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
+ struct gnet_stats_basic_packed b = {0};
int idx;
if (nla_len(opt) < sizeof(*parm))
@@ -221,15 +224,18 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (est == NULL)
return -ENOBUFS;
+ __gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+
idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->ewma_log = parm->ewma_log;
- est->last_bytes = bstats->bytes;
+ est->last_bytes = b.bytes;
est->avbps = rate_est->bps<<5;
- est->last_packets = bstats->packets;
+ est->last_packets = b.packets;
est->avpps = rate_est->pps<<10;
+ est->cpu_bstats = cpu_bstats;
spin_lock_bh(&est_tree_lock);
if (!elist[idx].timer.function) {
@@ -290,11 +296,12 @@ EXPORT_SYMBOL(gen_kill_estimator);
* Returns 0 on success or a negative error code.
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock, struct nlattr *opt)
{
gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 2ddbce4cce14..0c08062d1796 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -97,6 +97,43 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
}
EXPORT_SYMBOL(gnet_stats_start_copy);
+/* Add the byte and packet counters of every possible CPU to @bstats.
+ *
+ * The values are accumulated with +=, so the result is a plain sum only
+ * if @bstats starts out zeroed.
+ */
+static void
+__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
+			    struct gnet_stats_basic_cpu __percpu *cpu)
+{
+	int n;
+
+	for_each_possible_cpu(n) {
+		struct gnet_stats_basic_cpu *pcpu = per_cpu_ptr(cpu, n);
+		unsigned int seq;
+		u64 nbytes;
+		u32 npkts;
+
+		/* Re-read both counters for as long as the retry helper
+		 * reports that the snapshot has to be taken again.
+		 */
+		do {
+			seq = u64_stats_fetch_begin_irq(&pcpu->syncp);
+			nbytes = pcpu->bstats.bytes;
+			npkts = pcpu->bstats.packets;
+		} while (u64_stats_fetch_retry_irq(&pcpu->syncp, seq));
+
+		bstats->bytes += nbytes;
+		bstats->packets += npkts;
+	}
+}
+
+/* Fill @bstats from the per-CPU counters @cpu when they are provided,
+ * otherwise copy bytes and packets from the single structure @b.
+ */
+void
+__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+			struct gnet_stats_basic_cpu __percpu *cpu,
+			struct gnet_stats_basic_packed *b)
+{
+	if (!cpu) {
+		/* No per-CPU statistics: plain copy. */
+		bstats->bytes = b->bytes;
+		bstats->packets = b->packets;
+		return;
+	}
+
+	__gnet_stats_copy_basic_cpu(bstats, cpu);
+}
+EXPORT_SYMBOL(__gnet_stats_copy_basic);
+
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @d: dumping handle
@@ -109,19 +146,25 @@ EXPORT_SYMBOL(gnet_stats_start_copy);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
+gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
{
+ struct gnet_stats_basic_packed bstats = {0};
+
+ __gnet_stats_copy_basic(&bstats, cpu, b);
+
if (d->compat_tc_stats) {
- d->tc_stats.bytes = b->bytes;
- d->tc_stats.packets = b->packets;
+ d->tc_stats.bytes = bstats.bytes;
+ d->tc_stats.packets = bstats.packets;
}
if (d->tail) {
struct gnet_stats_basic sb;
memset(&sb, 0, sizeof(sb));
- sb.bytes = b->bytes;
- sb.packets = b->packets;
+ sb.bytes = bstats.bytes;
+ sb.packets = bstats.packets;
return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
}
return 0;
@@ -172,29 +215,74 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
}
EXPORT_SYMBOL(gnet_stats_copy_rate_est);
+/* Add the backlog, drops, requeues and overlimits counters of every
+ * possible CPU to @qstats and reset its qlen to 0.
+ *
+ * The four counters are accumulated with +=, so the result is a plain sum
+ * only if @qstats starts out zeroed.
+ */
+static void
+__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
+			    const struct gnet_stats_queue __percpu *q)
+{
+	int i;
+
+	/* qlen is not summed per CPU; clear it once instead of on every
+	 * loop iteration.
+	 */
+	qstats->qlen = 0;
+
+	for_each_possible_cpu(i) {
+		const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
+
+		qstats->backlog += qcpu->backlog;
+		qstats->drops += qcpu->drops;
+		qstats->requeues += qcpu->requeues;
+		qstats->overlimits += qcpu->overlimits;
+	}
+}
+
+/* Fill @qstats from the per-CPU counters @cpu when they are provided,
+ * otherwise from the single structure @q. In both cases the queue length
+ * stored in @qstats is the caller-supplied @qlen.
+ */
+static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+				    const struct gnet_stats_queue __percpu *cpu,
+				    const struct gnet_stats_queue *q,
+				    __u32 qlen)
+{
+	if (cpu) {
+		__gnet_stats_copy_queue_cpu(qstats, cpu);
+	} else {
+		/* q->qlen is not copied here: it would be overwritten by
+		 * @qlen right below.
+		 */
+		qstats->backlog = q->backlog;
+		qstats->drops = q->drops;
+		qstats->requeues = q->requeues;
+		qstats->overlimits = q->overlimits;
+	}
+
+	qstats->qlen = qlen;
+}
+
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
* @d: dumping handle
+ * @cpu_q: per cpu queue statistics
* @q: queue statistics
+ * @qlen: queue length statistics
*
* Appends the queue statistics to the top level TLV created by
- * gnet_stats_start_copy().
+ * gnet_stats_start_copy(). Using per cpu queue statistics if
+ * they are available.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
+gnet_stats_copy_queue(struct gnet_dump *d,
+ struct gnet_stats_queue __percpu *cpu_q,
+ struct gnet_stats_queue *q, __u32 qlen)
{
+ struct gnet_stats_queue qstats = {0};
+
+ __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
+
if (d->compat_tc_stats) {
- d->tc_stats.drops = q->drops;
- d->tc_stats.qlen = q->qlen;
- d->tc_stats.backlog = q->backlog;
- d->tc_stats.overlimits = q->overlimits;
+ d->tc_stats.drops = qstats.drops;
+ d->tc_stats.qlen = qstats.qlen;
+ d->tc_stats.backlog = qstats.backlog;
+ d->tc_stats.overlimits = qstats.overlimits;
}
if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q));
+ return gnet_stats_copy(d, TCA_STATS_QUEUE,
+ &qstats, sizeof(qstats));
return 0;
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7c6b51a58968..7f155175bba8 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -224,7 +224,7 @@ static void net_free(struct net *net)
return;
}
#endif
- kfree(net->gen);
+ kfree(rcu_access_pointer(net->gen));
kmem_cache_free(net_cachep, net);
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 907fb5e36c02..e6645b4f330a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644);
static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
- const struct net_device_ops *ops = dev->netdev_ops;
int status = NETDEV_TX_OK;
netdev_features_t features;
@@ -92,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
skb->vlan_tci = 0;
}
- status = ops->ndo_start_xmit(skb, dev);
- if (status == NETDEV_TX_OK)
- txq_trans_update(txq);
+ status = netdev_start_xmit(skb, dev, txq, false);
out:
return status;
@@ -116,7 +113,7 @@ static void queue_process(struct work_struct *work)
continue;
}
- txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+ txq = skb_get_tx_queue(dev, skb);
local_irq_save(flags);
HARD_TX_LOCK(dev, txq, smp_processor_id());
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8b849ddfef2e..443256bdcddc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -202,6 +202,7 @@
#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
#define F_NODE (1<<15) /* Node memory alloc*/
#define F_UDPCSUM (1<<16) /* Include UDP checksum */
+#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */
/* Thread control flag bits */
#define T_STOP (1<<0) /* Stop run */
@@ -386,6 +387,7 @@ struct pktgen_dev {
u16 queue_map_min;
u16 queue_map_max;
__u32 skb_priority; /* skb priority field */
+ unsigned int burst; /* number of duplicated packets to burst */
int node; /* Memory node */
#ifdef CONFIG_XFRM
@@ -505,7 +507,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
pktgen_reset_all_threads(pn);
else
- pr_warning("Unknown command: %s\n", data);
+ pr_warn("Unknown command: %s\n", data);
return count;
}
@@ -612,6 +614,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->traffic_class)
seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
+ if (pkt_dev->burst > 1)
+ seq_printf(seq, " burst: %d\n", pkt_dev->burst);
+
if (pkt_dev->node >= 0)
seq_printf(seq, " node: %d\n", pkt_dev->node);
@@ -638,6 +643,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->flags & F_UDPCSUM)
seq_puts(seq, "UDPCSUM ");
+ if (pkt_dev->flags & F_NO_TIMESTAMP)
+ seq_puts(seq, "NO_TIMESTAMP ");
+
if (pkt_dev->flags & F_MPLS_RND)
seq_puts(seq, "MPLS_RND ");
@@ -857,14 +865,14 @@ static ssize_t pktgen_if_write(struct file *file,
pg_result = &(pkt_dev->result[0]);
if (count < 1) {
- pr_warning("wrong command format\n");
+ pr_warn("wrong command format\n");
return -EINVAL;
}
max = count;
tmp = count_trail_chars(user_buffer, max);
if (tmp < 0) {
- pr_warning("illegal format\n");
+ pr_warn("illegal format\n");
return tmp;
}
i = tmp;
@@ -1120,6 +1128,16 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->dst_mac_count);
return count;
}
+	if (!strcmp(name, "burst")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		/* A burst below 1 is clamped to a single packet. */
+		pkt_dev->burst = value < 1 ? 1 : value;
+		/* burst is an unsigned int, so report it with %u */
+		sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
+		return count;
+	}
if (!strcmp(name, "node")) {
len = num_arg(&user_buffer[i], 10, &value);
if (len < 0)
@@ -1243,6 +1261,9 @@ static ssize_t pktgen_if_write(struct file *file,
else if (strcmp(f, "!UDPCSUM") == 0)
pkt_dev->flags &= ~F_UDPCSUM;
+ else if (strcmp(f, "NO_TIMESTAMP") == 0)
+ pkt_dev->flags |= F_NO_TIMESTAMP;
+
else {
sprintf(pg_result,
"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
@@ -1251,6 +1272,7 @@ static ssize_t pktgen_if_write(struct file *file,
"MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
"MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
"QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+ "NO_TIMESTAMP, "
#ifdef CONFIG_XFRM
"IPSEC, "
#endif
@@ -2048,15 +2070,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
ntxq = pkt_dev->odev->real_num_tx_queues;
if (ntxq <= pkt_dev->queue_map_min) {
- pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
- pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
- pkt_dev->odevname);
+ pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
}
if (pkt_dev->queue_map_max >= ntxq) {
- pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
- pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
- pkt_dev->odevname);
+ pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
}
@@ -2685,9 +2707,14 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
pgh->pgh_magic = htonl(PKTGEN_MAGIC);
pgh->seq_num = htonl(pkt_dev->seq_num);
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
+ if (pkt_dev->flags & F_NO_TIMESTAMP) {
+ pgh->tv_sec = 0;
+ pgh->tv_usec = 0;
+ } else {
+ do_gettimeofday(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_usec);
+ }
}
static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
@@ -3160,8 +3187,8 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
if (!pkt_dev->running) {
- pr_warning("interface: %s is already stopped\n",
- pkt_dev->odevname);
+ pr_warn("interface: %s is already stopped\n",
+ pkt_dev->odevname);
return -EINVAL;
}
@@ -3284,11 +3311,9 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
{
+ unsigned int burst = ACCESS_ONCE(pkt_dev->burst);
struct net_device *odev = pkt_dev->odev;
- netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *)
- = odev->netdev_ops->ndo_start_xmit;
struct netdev_queue *txq;
- u16 queue_map;
int ret;
/* If device is offline, then don't send */
@@ -3326,8 +3351,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->delay && pkt_dev->last_ok)
spin(pkt_dev, pkt_dev->next_tx);
- queue_map = skb_get_queue_mapping(pkt_dev->skb);
- txq = netdev_get_tx_queue(odev, queue_map);
+ txq = skb_get_tx_queue(odev, pkt_dev->skb);
local_bh_disable();
@@ -3338,16 +3362,19 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
goto unlock;
}
- atomic_inc(&(pkt_dev->skb->users));
- ret = (*xmit)(pkt_dev->skb, odev);
+ atomic_add(burst, &pkt_dev->skb->users);
+
+xmit_more:
+ ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
switch (ret) {
case NETDEV_TX_OK:
- txq_trans_update(txq);
pkt_dev->last_ok = 1;
pkt_dev->sofar++;
pkt_dev->seq_num++;
pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq))
+ goto xmit_more;
break;
case NET_XMIT_DROP:
case NET_XMIT_CN:
@@ -3366,6 +3393,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
atomic_dec(&(pkt_dev->skb->users));
pkt_dev->last_ok = 0;
}
+ if (unlikely(burst))
+ atomic_sub(burst, &pkt_dev->skb->users);
unlock:
HARD_TX_UNLOCK(odev, txq);
@@ -3564,6 +3593,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_p = 0;
pkt_dev->svlan_cfi = 0;
pkt_dev->svlan_id = 0xffff;
+ pkt_dev->burst = 1;
pkt_dev->node = -1;
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
@@ -3684,7 +3714,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
if (pkt_dev->running) {
- pr_warning("WARNING: trying to remove a running interface, stopping it now\n");
+ pr_warn("WARNING: trying to remove a running interface, stopping it now\n");
pktgen_stop_device(pkt_dev);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f0493e3b7471..a6882686ca3a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1481,9 +1481,12 @@ static int do_set_master(struct net_device *dev, int ifindex)
return 0;
}
+#define DO_SETLINK_MODIFIED 0x01
+/* notify flag means notify + modified. */
+#define DO_SETLINK_NOTIFY 0x03
static int do_setlink(const struct sk_buff *skb,
struct net_device *dev, struct ifinfomsg *ifm,
- struct nlattr **tb, char *ifname, int modified)
+ struct nlattr **tb, char *ifname, int status)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
@@ -1502,7 +1505,7 @@ static int do_setlink(const struct sk_buff *skb,
put_net(net);
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_MAP]) {
@@ -1531,7 +1534,7 @@ static int do_setlink(const struct sk_buff *skb,
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (tb[IFLA_ADDRESS]) {
@@ -1551,19 +1554,19 @@ static int do_setlink(const struct sk_buff *skb,
kfree(sa);
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_MTU]) {
err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_GROUP]) {
dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
/*
@@ -1575,7 +1578,7 @@ static int do_setlink(const struct sk_buff *skb,
err = dev_change_name(dev, ifname);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_IFALIAS]) {
@@ -1583,7 +1586,7 @@ static int do_setlink(const struct sk_buff *skb,
nla_len(tb[IFLA_IFALIAS]));
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (tb[IFLA_BROADCAST]) {
@@ -1601,25 +1604,35 @@ static int do_setlink(const struct sk_buff *skb,
err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_CARRIER]) {
err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
- if (tb[IFLA_TXQLEN])
- dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+ if (tb[IFLA_TXQLEN]) {
+ unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
+
+ if (dev->tx_queue_len ^ value)
+ status |= DO_SETLINK_NOTIFY;
+
+ dev->tx_queue_len = value;
+ }
if (tb[IFLA_OPERSTATE])
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
if (tb[IFLA_LINKMODE]) {
+ unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
+
write_lock_bh(&dev_base_lock);
- dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (dev->link_mode ^ value)
+ status |= DO_SETLINK_NOTIFY;
+ dev->link_mode = value;
write_unlock_bh(&dev_base_lock);
}
@@ -1634,7 +1647,7 @@ static int do_setlink(const struct sk_buff *skb,
err = do_setvfinfo(dev, attr);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
@@ -1664,7 +1677,7 @@ static int do_setlink(const struct sk_buff *skb,
err = ops->ndo_set_vf_port(dev, vf, port);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
@@ -1682,7 +1695,7 @@ static int do_setlink(const struct sk_buff *skb,
err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (tb[IFLA_AF_SPEC]) {
@@ -1699,15 +1712,20 @@ static int do_setlink(const struct sk_buff *skb,
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
errout:
- if (err < 0 && modified)
- net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
- dev->name);
+	if (status & DO_SETLINK_MODIFIED) {
+		/* DO_SETLINK_NOTIFY (0x03) contains the MODIFIED bit, so a
+		 * plain bitwise AND would be true for every modification.
+		 * Compare against the full mask so that the notification is
+		 * sent only when it was actually requested.
+		 */
+		if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
+			netdev_state_change(dev);
+
+		if (err < 0)
+			net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
+					     dev->name);
+	}
return err;
}
@@ -1989,7 +2007,7 @@ replay:
}
if (dev) {
- int modified = 0;
+ int status = 0;
if (nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
@@ -2004,7 +2022,7 @@ replay:
err = ops->changelink(dev, tb, data);
if (err < 0)
return err;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
@@ -2015,10 +2033,10 @@ replay:
tb, slave_data);
if (err < 0)
return err;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
- return do_setlink(skb, dev, ifm, tb, ifname, modified);
+ return do_setlink(skb, dev, ifm, tb, ifname, status);
}
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index ba71212f0251..51dd3193a33e 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq)
* overlaps less than one time per MSL (2 minutes).
* Choosing a clock of 64 ns period is OK. (period of 274 s)
*/
- return seq + (ktime_to_ns(ktime_get_real()) >> 6);
+ return seq + (ktime_get_real_ns() >> 6);
}
#endif
@@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
md5_transform(hash, net_secret);
seq = hash[0] | (((u64)hash[1]) << 32);
- seq += ktime_to_ns(ktime_get_real());
+ seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
return seq;
@@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
md5_transform(hash, secret);
seq = hash[0] | (((u64)hash[1]) << 32);
- seq += ktime_to_ns(ktime_get_real());
+ seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
return seq;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index da1378a3e2c7..7b3df0d518ab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -257,16 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
kmemcheck_annotate_variable(shinfo->destructor_arg);
if (flags & SKB_ALLOC_FCLONE) {
- struct sk_buff *child = skb + 1;
- atomic_t *fclone_ref = (atomic_t *) (child + 1);
+ struct sk_buff_fclones *fclones;
- kmemcheck_annotate_bitfield(child, flags1);
- kmemcheck_annotate_bitfield(child, flags2);
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
skb->fclone = SKB_FCLONE_ORIG;
- atomic_set(fclone_ref, 1);
+ atomic_set(&fclones->fclone_ref, 1);
- child->fclone = SKB_FCLONE_UNAVAILABLE;
- child->pfmemalloc = pfmemalloc;
+ fclones->skb2.fclone = SKB_FCLONE_FREE;
+ fclones->skb2.pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -491,32 +491,33 @@ static void skb_free_head(struct sk_buff *skb)
static void skb_release_data(struct sk_buff *skb)
{
- if (!skb->cloned ||
- !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
- &skb_shinfo(skb)->dataref)) {
- if (skb_shinfo(skb)->nr_frags) {
- int i;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- skb_frag_unref(skb, i);
- }
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int i;
- /*
- * If skb buf is from userspace, we need to notify the caller
- * the lower device DMA has done;
- */
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- struct ubuf_info *uarg;
+ if (skb->cloned &&
+ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+ &shinfo->dataref))
+ return;
- uarg = skb_shinfo(skb)->destructor_arg;
- if (uarg->callback)
- uarg->callback(uarg, true);
- }
+ for (i = 0; i < shinfo->nr_frags; i++)
+ __skb_frag_unref(&shinfo->frags[i]);
- if (skb_has_frag_list(skb))
- skb_drop_fraglist(skb);
+ /*
+ * If skb buf is from userspace, we need to notify the caller
+ * the lower device DMA has done;
+ */
+ if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
- skb_free_head(skb);
+ uarg = shinfo->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, true);
}
+
+ if (shinfo->frag_list)
+ kfree_skb_list(shinfo->frag_list);
+
+ skb_free_head(skb);
}
/*
@@ -524,8 +525,7 @@ static void skb_release_data(struct sk_buff *skb)
*/
static void kfree_skbmem(struct sk_buff *skb)
{
- struct sk_buff *other;
- atomic_t *fclone_ref;
+ struct sk_buff_fclones *fclones;
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
@@ -533,22 +533,28 @@ static void kfree_skbmem(struct sk_buff *skb)
break;
case SKB_FCLONE_ORIG:
- fclone_ref = (atomic_t *) (skb + 2);
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, skb);
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+ if (atomic_dec_and_test(&fclones->fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, fclones);
break;
case SKB_FCLONE_CLONE:
- fclone_ref = (atomic_t *) (skb + 1);
- other = skb - 1;
+ fclones = container_of(skb, struct sk_buff_fclones, skb2);
- /* The clone portion is available for
- * fast-cloning again.
+ /* Warning : We must perform the atomic_dec_and_test() before
+ * setting skb->fclone back to SKB_FCLONE_FREE, otherwise
+ * skb_clone() could set clone_ref to 2 before our decrement.
+ * Anyway, if we are going to free the structure, no need to
+ * rewrite skb->fclone.
*/
- skb->fclone = SKB_FCLONE_UNAVAILABLE;
-
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, other);
+ if (atomic_dec_and_test(&fclones->fclone_ref)) {
+ kmem_cache_free(skbuff_fclone_cache, fclones);
+ } else {
+ /* The clone portion is available for
+ * fast-cloning again.
+ */
+ skb->fclone = SKB_FCLONE_FREE;
+ }
break;
}
}
@@ -566,7 +572,7 @@ static void skb_release_head_state(struct sk_buff *skb)
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb->nfct);
#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nf_bridge_put(skb->nf_bridge);
#endif
/* XXX: IS this still necessary? - JHS */
@@ -674,57 +680,61 @@ void consume_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(consume_skb);
+/* Make sure a field is enclosed inside headers_start/headers_end section.
+ *
+ * Both compile-time checks are wrapped in do { } while (0) so that the macro
+ * expands to exactly one statement and must be terminated by the caller's
+ * semicolon.
+ */
+#define CHECK_SKB_FIELD(field)						\
+	do {								\
+		BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
+			     offsetof(struct sk_buff, headers_start));	\
+		BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
+			     offsetof(struct sk_buff, headers_end));	\
+	} while (0)
+
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
new->tstamp = old->tstamp;
+ /* We do not copy old->sk */
new->dev = old->dev;
- new->transport_header = old->transport_header;
- new->network_header = old->network_header;
- new->mac_header = old->mac_header;
- new->inner_protocol = old->inner_protocol;
- new->inner_transport_header = old->inner_transport_header;
- new->inner_network_header = old->inner_network_header;
- new->inner_mac_header = old->inner_mac_header;
+ memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
- skb_copy_hash(new, old);
- new->ooo_okay = old->ooo_okay;
- new->no_fcs = old->no_fcs;
- new->encapsulation = old->encapsulation;
- new->encap_hdr_csum = old->encap_hdr_csum;
- new->csum_valid = old->csum_valid;
- new->csum_complete_sw = old->csum_complete_sw;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
- memcpy(new->cb, old->cb, sizeof(old->cb));
- new->csum = old->csum;
- new->ignore_df = old->ignore_df;
- new->pkt_type = old->pkt_type;
- new->ip_summed = old->ip_summed;
- skb_copy_queue_mapping(new, old);
- new->priority = old->priority;
-#if IS_ENABLED(CONFIG_IP_VS)
- new->ipvs_property = old->ipvs_property;
+ __nf_copy(new, old, false);
+
+ /* Note : this field could be in headers_start/headers_end section
+ * It is not yet because we do not want to have a 16 bit hole
+ */
+ new->queue_mapping = old->queue_mapping;
+
+ memcpy(&new->headers_start, &old->headers_start,
+ offsetof(struct sk_buff, headers_end) -
+ offsetof(struct sk_buff, headers_start));
+ CHECK_SKB_FIELD(protocol);
+ CHECK_SKB_FIELD(csum);
+ CHECK_SKB_FIELD(hash);
+ CHECK_SKB_FIELD(priority);
+ CHECK_SKB_FIELD(skb_iif);
+ CHECK_SKB_FIELD(vlan_proto);
+ CHECK_SKB_FIELD(vlan_tci);
+ CHECK_SKB_FIELD(transport_header);
+ CHECK_SKB_FIELD(network_header);
+ CHECK_SKB_FIELD(mac_header);
+ CHECK_SKB_FIELD(inner_protocol);
+ CHECK_SKB_FIELD(inner_transport_header);
+ CHECK_SKB_FIELD(inner_network_header);
+ CHECK_SKB_FIELD(inner_mac_header);
+ CHECK_SKB_FIELD(mark);
+#ifdef CONFIG_NETWORK_SECMARK
+ CHECK_SKB_FIELD(secmark);
+#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ CHECK_SKB_FIELD(napi_id);
#endif
- new->pfmemalloc = old->pfmemalloc;
- new->protocol = old->protocol;
- new->mark = old->mark;
- new->skb_iif = old->skb_iif;
- __nf_copy(new, old);
#ifdef CONFIG_NET_SCHED
- new->tc_index = old->tc_index;
+ CHECK_SKB_FIELD(tc_index);
#ifdef CONFIG_NET_CLS_ACT
- new->tc_verd = old->tc_verd;
+ CHECK_SKB_FIELD(tc_verd);
#endif
#endif
- new->vlan_proto = old->vlan_proto;
- new->vlan_tci = old->vlan_tci;
- skb_copy_secmark(new, old);
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
- new->napi_id = old->napi_id;
-#endif
}
/*
@@ -855,17 +865,22 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
- struct sk_buff *n;
+ struct sk_buff_fclones *fclones = container_of(skb,
+ struct sk_buff_fclones,
+ skb1);
+ struct sk_buff *n = &fclones->skb2;
if (skb_orphan_frags(skb, gfp_mask))
return NULL;
- n = skb + 1;
if (skb->fclone == SKB_FCLONE_ORIG &&
- n->fclone == SKB_FCLONE_UNAVAILABLE) {
- atomic_t *fclone_ref = (atomic_t *) (n + 1);
+ n->fclone == SKB_FCLONE_FREE) {
n->fclone = SKB_FCLONE_CLONE;
- atomic_inc(fclone_ref);
+ /* As our fastclone was free, fclone_ref must be 1 at this point.
+ * We could use atomic_inc() here, but it is faster
+ * to set the final value.
+ */
+ atomic_set(&fclones->fclone_ref, 2);
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -875,7 +890,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
return NULL;
kmemcheck_annotate_bitfield(n, flags1);
- kmemcheck_annotate_bitfield(n, flags2);
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
@@ -3069,6 +3083,11 @@ perform_csum_check:
}
} while ((offset += len) < head_skb->len);
+ /* Some callers want to get the end of the list.
+ * Put it in segs->prev to avoid walking the list.
+ * (see validate_xmit_skb_list() for example)
+ */
+ segs->prev = tail;
return segs;
err:
@@ -3152,6 +3171,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
}
+ /* switch back to head shinfo */
+ pinfo = skb_shinfo(p);
+
if (pinfo->frag_list)
goto merge;
if (skb_gro_len(p) != pinfo->gso_size)
@@ -3179,7 +3201,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
skb_shinfo(nskb)->frag_list = p;
skb_shinfo(nskb)->gso_size = pinfo->gso_size;
pinfo->gso_size = 0;
- skb_header_release(p);
+ __skb_header_release(p);
NAPI_GRO_CB(nskb)->last = p;
nskb->data_len += p->len;
@@ -3211,7 +3233,7 @@ merge:
else
NAPI_GRO_CB(p)->last->next = skb;
NAPI_GRO_CB(p)->last = skb;
- skb_header_release(skb);
+ __skb_header_release(skb);
lp = p;
done:
@@ -3227,7 +3249,6 @@ done:
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
-EXPORT_SYMBOL_GPL(skb_gro_receive);
void __init skb_init(void)
{
@@ -3237,8 +3258,7 @@ void __init skb_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
- (2*sizeof(struct sk_buff)) +
- sizeof(atomic_t),
+ sizeof(struct sk_buff_fclones),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
@@ -3491,32 +3511,66 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
-void __skb_tstamp_tx(struct sk_buff *orig_skb,
- struct skb_shared_hwtstamps *hwtstamps,
- struct sock *sk, int tstype)
+struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
- struct sock_exterr_skb *serr;
- struct sk_buff *skb;
- int err;
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *skb_next;
+ int err = 0;
- if (!sk)
- return;
+ spin_lock_bh(&q->lock);
+ skb = __skb_dequeue(q);
+ if (skb && (skb_next = skb_peek(q)))
+ err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ spin_unlock_bh(&q->lock);
- if (hwtstamps) {
- *skb_hwtstamps(orig_skb) =
- *hwtstamps;
- } else {
- /*
- * no hardware time stamps available,
- * so keep the shared tx_flags and only
- * store software time stamp
- */
- orig_skb->tstamp = ktime_get_real();
+ sk->sk_err = err;
+ if (err)
+ sk->sk_error_report(sk);
+
+ return skb;
+}
+EXPORT_SYMBOL(sock_dequeue_err_skb);
+
+/**
+ * skb_clone_sk - create clone of skb, and take reference to socket
+ * @skb: the skb to clone
+ *
+ * This function creates a clone of a buffer that holds a reference on
+ * sk_refcnt. Buffers created via this function are meant to be
+ * returned using sock_queue_err_skb, or freed via kfree_skb.
+ *
+ * When passing buffers allocated with this function to sock_queue_err_skb
+ * it is necessary to wrap the call with sock_hold/sock_put in order to
+ * prevent the socket from being released prior to being enqueued on
+ * the sk_error_queue.
+ */
+struct sk_buff *skb_clone_sk(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct sk_buff *clone;
+
+ if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return NULL;
}
- skb = skb_clone(orig_skb, GFP_ATOMIC);
- if (!skb)
- return;
+ clone->sk = sk;
+ clone->destructor = sock_efree;
+
+ return clone;
+}
+EXPORT_SYMBOL(skb_clone_sk);
+
+static void __skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct sock *sk,
+ int tstype)
+{
+ struct sock_exterr_skb *serr;
+ int err;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
@@ -3534,6 +3588,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (err)
kfree_skb(skb);
}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = skb->sk;
+
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
+
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ struct sk_buff *skb;
+
+ if (!sk)
+ return;
+
+ if (hwtstamps)
+ *skb_hwtstamps(orig_skb) = *hwtstamps;
+ else
+ orig_skb->tstamp = ktime_get_real();
+
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ __skb_complete_tx_timestamp(skb, sk, tstype);
+}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
@@ -3558,9 +3648,14 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
err = sock_queue_err_skb(sk, skb);
if (err)
kfree_skb(skb);
+
+ sock_put(sk);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
@@ -3861,7 +3956,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
return false;
if (len <= skb_tailroom(to)) {
- BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ if (len)
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
return true;
}
@@ -4026,3 +4122,81 @@ err_free:
return NULL;
}
EXPORT_SYMBOL(skb_vlan_untag);
+
+/**
+ * alloc_skb_with_frags - allocate skb with page frags
+ *
+ * @header_len: size of linear part
+ * @data_len: needed length in frags
+ * @max_page_order: max page order desired.
+ * @errcode: pointer to error code if any
+ * @gfp_mask: allocation mask
+ *
+ * This can be used to allocate a paged skb, given a maximal order for frags.
+ */
+struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
+ unsigned long data_len,
+ int max_page_order,
+ int *errcode,
+ gfp_t gfp_mask)
+{
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ unsigned long chunk;
+ struct sk_buff *skb;
+ struct page *page;
+ gfp_t gfp_head;
+ int i;
+
+ *errcode = -EMSGSIZE;
+ /* Note this test could be relaxed, if we succeed to allocate
+ * high order pages...
+ */
+ if (npages > MAX_SKB_FRAGS)
+ return NULL;
+
+ gfp_head = gfp_mask;
+ if (gfp_head & __GFP_WAIT)
+ gfp_head |= __GFP_REPEAT;
+
+ *errcode = -ENOBUFS;
+ skb = alloc_skb(header_len, gfp_head);
+ if (!skb)
+ return NULL;
+
+ skb->truesize += npages << PAGE_SHIFT;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages(gfp_mask |
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
+ order);
+ if (page)
+ goto fill_page;
+ /* Do not retry other high order allocations */
+ order = 1;
+ max_page_order = 0;
+ }
+ order--;
+ }
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
+ }
+ return skb;
+
+failure:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_skb_with_frags);
diff --git a/net/core/sock.c b/net/core/sock.c
index d372b4bd3f99..b4f3ea2fce60 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -437,7 +437,6 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err;
- int skb_len;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
@@ -459,13 +458,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
skb->dev = NULL;
skb_set_owner_r(skb, sk);
- /* Cache the SKB length before we tack it onto the receive
- * queue. Once it is added it no longer belongs to us and
- * may be freed by other threads of control pulling packets
- * from the queue.
- */
- skb_len = skb->len;
-
/* we escape from rcu protected region, make sure we dont leak
* a norefcounted dst
*/
@@ -1489,9 +1481,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
atomic_set(&newsk->sk_omem_alloc, 0);
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
-#ifdef CONFIG_NET_DMA
- skb_queue_head_init(&newsk->sk_async_wait_queue);
-#endif
spin_lock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock);
@@ -1645,18 +1634,24 @@ void sock_rfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_rfree);
+void sock_efree(struct sk_buff *skb)
+{
+ sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_efree);
+
+#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
-#ifdef CONFIG_INET
if (sk->sk_state == TCP_TIME_WAIT)
inet_twsk_put(inet_twsk(sk));
else
-#endif
sock_put(sk);
}
EXPORT_SYMBOL(sock_edemux);
+#endif
kuid_t sock_i_uid(struct sock *sk)
{
@@ -1764,21 +1759,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
unsigned long data_len, int noblock,
int *errcode, int max_page_order)
{
- struct sk_buff *skb = NULL;
- unsigned long chunk;
- gfp_t gfp_mask;
+ struct sk_buff *skb;
long timeo;
int err;
- int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
- struct page *page;
- int i;
-
- err = -EMSGSIZE;
- if (npages > MAX_SKB_FRAGS)
- goto failure;
timeo = sock_sndtimeo(sk, noblock);
- while (!skb) {
+ for (;;) {
err = sock_error(sk);
if (err != 0)
goto failure;
@@ -1787,66 +1773,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- err = -EAGAIN;
- if (!timeo)
- goto failure;
- if (signal_pending(current))
- goto interrupted;
- timeo = sock_wait_for_wmem(sk, timeo);
- continue;
- }
-
- err = -ENOBUFS;
- gfp_mask = sk->sk_allocation;
- if (gfp_mask & __GFP_WAIT)
- gfp_mask |= __GFP_REPEAT;
+ if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ break;
- skb = alloc_skb(header_len, gfp_mask);
- if (!skb)
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
goto failure;
-
- skb->truesize += data_len;
-
- for (i = 0; npages > 0; i++) {
- int order = max_page_order;
-
- while (order) {
- if (npages >= 1 << order) {
- page = alloc_pages(sk->sk_allocation |
- __GFP_COMP |
- __GFP_NOWARN |
- __GFP_NORETRY,
- order);
- if (page)
- goto fill_page;
- /* Do not retry other high order allocations */
- order = 1;
- max_page_order = 0;
- }
- order--;
- }
- page = alloc_page(sk->sk_allocation);
- if (!page)
- goto failure;
-fill_page:
- chunk = min_t(unsigned long, data_len,
- PAGE_SIZE << order);
- skb_fill_page_desc(skb, i, page, 0, chunk);
- data_len -= chunk;
- npages -= 1 << order;
- }
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = sock_wait_for_wmem(sk, timeo);
}
-
- skb_set_owner_w(skb, sk);
+ skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
+ errcode, sk->sk_allocation);
+ if (skb)
+ skb_set_owner_w(skb, sk);
return skb;
interrupted:
err = sock_intr_errno(timeo);
failure:
- kfree_skb(skb);
*errcode = err;
return NULL;
}
@@ -1866,7 +1813,7 @@ EXPORT_SYMBOL(sock_alloc_send_skb);
* skb_page_frag_refill - check that a page_frag contains enough room
* @sz: minimum size of the fragment we want to get
* @pfrag: pointer to page_frag
- * @prio: priority for memory allocation
+ * @gfp: priority for memory allocation
*
* Note: While this allocator tries to use high order pages, there is
* no guarantee that allocations succeed. Therefore, @sz MUST be
@@ -2308,9 +2255,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
skb_queue_head_init(&sk->sk_receive_queue);
skb_queue_head_init(&sk->sk_write_queue);
skb_queue_head_init(&sk->sk_error_queue);
-#ifdef CONFIG_NET_DMA
- skb_queue_head_init(&sk->sk_async_wait_queue);
-#endif
sk->sk_send_head = NULL;
@@ -2498,11 +2442,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
int level, int type)
{
struct sock_exterr_skb *serr;
- struct sk_buff *skb, *skb2;
+ struct sk_buff *skb;
int copied, err;
err = -EAGAIN;
- skb = skb_dequeue(&sk->sk_error_queue);
+ skb = sock_dequeue_err_skb(sk);
if (skb == NULL)
goto out;
@@ -2523,16 +2467,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
- /* Reset and regenerate socket error */
- spin_lock_bh(&sk->sk_error_queue.lock);
- sk->sk_err = 0;
- if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
- sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_bh(&sk->sk_error_queue.lock);
- sk->sk_error_report(sk);
- } else
- spin_unlock_bh(&sk->sk_error_queue.lock);
-
out_free_skb:
kfree_skb(skb);
out:
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index a8770391ea5b..43d3dd62fcc8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
{
struct phy_device *phydev;
struct sk_buff *clone;
- struct sock *sk = skb->sk;
unsigned int type;
- if (!sk)
+ if (!skb->sk)
return;
type = classify(skb);
@@ -48,50 +47,14 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
phydev = skb->dev->phydev;
if (likely(phydev->drv->txtstamp)) {
- if (!atomic_inc_not_zero(&sk->sk_refcnt))
+ clone = skb_clone_sk(skb);
+ if (!clone)
return;
-
- clone = skb_clone(skb, GFP_ATOMIC);
- if (!clone) {
- sock_put(sk);
- return;
- }
-
- clone->sk = sk;
phydev->drv->txtstamp(phydev, clone, type);
}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
-void skb_complete_tx_timestamp(struct sk_buff *skb,
- struct skb_shared_hwtstamps *hwtstamps)
-{
- struct sock *sk = skb->sk;
- struct sock_exterr_skb *serr;
- int err;
-
- if (!hwtstamps) {
- sock_put(sk);
- kfree_skb(skb);
- return;
- }
-
- *skb_hwtstamps(skb) = *hwtstamps;
-
- serr = SKB_EXT_ERR(skb);
- memset(serr, 0, sizeof(*serr));
- serr->ee.ee_errno = ENOMSG;
- serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
- skb->sk = NULL;
-
- err = sock_queue_err_skb(sk, skb);
-
- sock_put(sk);
- if (err)
- kfree_skb(skb);
-}
-EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
-
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
struct phy_device *phydev;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
deleted file mode 100644
index 1b5fefdb8198..000000000000
--- a/net/core/user_dma.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
- * Portions based on net/core/datagram.c and copyrighted by their authors.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-/*
- * This code allows the net stack to make use of a DMA engine for
- * skb to iovec copies.
- */
-
-#include <linux/dmaengine.h>
-#include <linux/socket.h>
-#include <linux/export.h>
-#include <net/tcp.h>
-#include <net/netdma.h>
-
-#define NET_DMA_DEFAULT_COPYBREAK 4096
-
-int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
-EXPORT_SYMBOL(sysctl_tcp_dma_copybreak);
-
-/**
- * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
- * @skb - buffer to copy
- * @offset - offset in the buffer to start copying from
- * @iovec - io vector to copy to
- * @len - amount of data to copy from buffer to iovec
- * @pinned_list - locked iovec buffer data
- *
- * Note: the iovec is modified during the copy.
- */
-int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
- struct sk_buff *skb, int offset, struct iovec *to,
- size_t len, struct dma_pinned_list *pinned_list)
-{
- int start = skb_headlen(skb);
- int i, copy = start - offset;
- struct sk_buff *frag_iter;
- dma_cookie_t cookie = 0;
-
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
- skb->data + offset, copy);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
-
- /* Copy paged appendix. Hmm... why does this look so complicated? */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
- const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
- WARN_ON(start > offset + len);
-
- end = start + skb_frag_size(frag);
- copy = end - offset;
- if (copy > 0) {
- struct page *page = skb_frag_page(frag);
-
- if (copy > len)
- copy = len;
-
- cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
- frag->page_offset + offset - start, copy);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
- start = end;
- }
-
- skb_walk_frags(skb, frag_iter) {
- int end;
-
- WARN_ON(start > offset + len);
-
- end = start + frag_iter->len;
- copy = end - offset;
- if (copy > 0) {
- if (copy > len)
- copy = len;
- cookie = dma_skb_copy_datagram_iovec(chan, frag_iter,
- offset - start,
- to, copy,
- pinned_list);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
- start = end;
- }
-
-end:
- if (!len) {
- skb->dma_cookie = cookie;
- return cookie;
- }
-
-fault:
- return -EFAULT;
-}
diff --git a/net/core/utils.c b/net/core/utils.c
index eed34338736c..efc76dd9dcd1 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -306,16 +306,14 @@ EXPORT_SYMBOL(in6_pton);
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, int pseudohdr)
{
- __be32 diff[] = { ~from, to };
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- *sum = csum_fold(csum_partial(diff, sizeof(diff),
- ~csum_unfold(*sum)));
+ *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
+ to));
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_partial(diff, sizeof(diff),
- ~skb->csum);
+ skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
} else if (pseudohdr)
- *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
- csum_unfold(*sum)));
+ *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
+ to));
}
EXPORT_SYMBOL(inet_proto_csum_replace4);