Diffstat (limited to 'net/core')
-rw-r--r--  net/core/datagram.c        | 301
-rw-r--r--  net/core/dev.c             | 172
-rw-r--r--  net/core/dev_addr_lists.c  |   4
-rw-r--r--  net/core/dev_ioctl.c       |   9
-rw-r--r--  net/core/dst.c             |  24
-rw-r--r--  net/core/ethtool.c         |  81
-rw-r--r--  net/core/filter.c          |  97
-rw-r--r--  net/core/iovec.c           |  47
-rw-r--r--  net/core/link_watch.c      |   2
-rw-r--r--  net/core/neighbour.c       | 279
-rw-r--r--  net/core/net-sysfs.c       |  44
-rw-r--r--  net/core/netpoll.c         |   4
-rw-r--r--  net/core/pktgen.c          |   3
-rw-r--r--  net/core/rtnetlink.c       | 154
-rw-r--r--  net/core/scm.c             |   3
-rw-r--r--  net/core/skbuff.c          | 347
-rw-r--r--  net/core/sock.c            |  20
-rw-r--r--  net/core/sysctl_net_core.c |  21
-rw-r--r--  net/core/utils.c           |   3
19 files changed, 985 insertions(+), 630 deletions(-)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a81d4c2..df493d68330c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/uio.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -309,16 +310,14 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
EXPORT_SYMBOL(skb_kill_datagram);
/**
- * skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
- * @to: io vector to copy to
+ * @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
- *
- * Note: the iovec is modified during the copy.
*/
-int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
- struct iovec *to, int len)
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -330,8 +329,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (memcpy_toiovec(to, skb->data + offset, copy))
- goto fault;
+ if (copy_to_iter(skb->data + offset, copy, to) != copy)
+ goto short_copy;
if ((len -= copy) == 0)
return 0;
offset += copy;
@@ -346,18 +345,12 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- struct page *page = skb_frag_page(frag);
-
if (copy > len)
copy = len;
- vaddr = kmap(page);
- err = memcpy_toiovec(to, vaddr + frag->page_offset +
- offset - start, copy);
- kunmap(page);
- if (err)
- goto fault;
+ if (copy_page_to_iter(skb_frag_page(frag),
+ frag->page_offset + offset -
+ start, copy, to) != copy)
+ goto short_copy;
if (!(len -= copy))
return 0;
offset += copy;
@@ -374,9 +367,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (skb_copy_datagram_iovec(frag_iter,
- offset - start,
- to, copy))
+ if (skb_copy_datagram_iter(frag_iter, offset - start,
+ to, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
@@ -387,113 +379,33 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
if (!len)
return 0;
+ /* This is not really a user copy fault, but rather someone
+ * gave us a bogus length on the skb. We should probably
+ * print a warning here as it may indicate a kernel bug.
+ */
+
fault:
return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_iovec);
-/**
- * skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- * @skb: buffer to copy
- * @offset: offset in the buffer to start copying from
- * @to: io vector to copy to
- * @to_offset: offset in the io vector to start copying to
- * @len: amount of data to copy from buffer to iovec
- *
- * Returns 0 or -EFAULT.
- * Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
- const struct iovec *to, int to_offset,
- int len)
-{
- int start = skb_headlen(skb);
- int i, copy = start - offset;
- struct sk_buff *frag_iter;
+short_copy:
+ if (iov_iter_count(to))
+ goto fault;
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to_offset += copy;
- }
-
- /* Copy paged appendix. Hmm... why does this look so complicated? */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
- const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
- WARN_ON(start > offset + len);
-
- end = start + skb_frag_size(frag);
- if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- struct page *page = skb_frag_page(frag);
-
- if (copy > len)
- copy = len;
- vaddr = kmap(page);
- err = memcpy_toiovecend(to, vaddr + frag->page_offset +
- offset - start, to_offset, copy);
- kunmap(page);
- if (err)
- goto fault;
- if (!(len -= copy))
- return 0;
- offset += copy;
- to_offset += copy;
- }
- start = end;
- }
-
- skb_walk_frags(skb, frag_iter) {
- int end;
-
- WARN_ON(start > offset + len);
-
- end = start + frag_iter->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_copy_datagram_const_iovec(frag_iter,
- offset - start,
- to, to_offset,
- copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to_offset += copy;
- }
- start = end;
- }
- if (!len)
- return 0;
-
-fault:
- return -EFAULT;
+ return 0;
}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
+EXPORT_SYMBOL(skb_copy_datagram_iter);
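A minimal sketch of how a recvmsg-style caller might drive the converted helper: build an iov_iter over the caller's iovec with iov_iter_init() and hand it to skb_copy_datagram_iter(). The example_* names below are hypothetical; only iov_iter_init() and skb_copy_datagram_iter() come from the v3.19-era uio API and this patch.

    #include <linux/uio.h>
    #include <linux/skbuff.h>

    /* Hypothetical receive helper: copy 'len' bytes of 'skb' into the
     * caller-supplied iovec via an iov_iter (READ = data flows to user). */
    static int example_recv(struct sk_buff *skb, const struct iovec *iov,
                            unsigned long nr_segs, size_t len)
    {
            struct iov_iter to;

            iov_iter_init(&to, READ, iov, nr_segs, len);
            return skb_copy_datagram_iter(skb, 0, &to, len);
    }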
/**
- * skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
+ * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying to
- * @from: io vector to copy to
- * @from_offset: offset in the io vector to start copying from
+ * @from: the copy source
* @len: amount of data to copy to buffer from iovec
*
* Returns 0 or -EFAULT.
- * Note: the iovec is not modified during the copy.
*/
-int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
- const struct iovec *from, int from_offset,
+int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
+ struct iov_iter *from,
int len)
{
int start = skb_headlen(skb);
@@ -504,13 +416,11 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
- copy))
+ if (copy_from_iter(skb->data + offset, copy, from) != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
- from_offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -522,24 +432,19 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- struct page *page = skb_frag_page(frag);
+ size_t copied;
if (copy > len)
copy = len;
- vaddr = kmap(page);
- err = memcpy_fromiovecend(vaddr + frag->page_offset +
- offset - start,
- from, from_offset, copy);
- kunmap(page);
- if (err)
+ copied = copy_page_from_iter(skb_frag_page(frag),
+ frag->page_offset + offset - start,
+ copy, from);
+ if (copied != copy)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
- from_offset += copy;
}
start = end;
}
@@ -553,16 +458,13 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (skb_copy_datagram_from_iovec(frag_iter,
- offset - start,
- from,
- from_offset,
- copy))
+ if (skb_copy_datagram_from_iter(frag_iter,
+ offset - start,
+ from, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
- from_offset += copy;
}
start = end;
}
@@ -572,101 +474,82 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
fault:
return -EFAULT;
}
-EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
+EXPORT_SYMBOL(skb_copy_datagram_from_iter);
/**
- * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
* @skb: buffer to copy
- * @from: io vector to copy from
- * @offset: offset in the io vector to start copying from
- * @count: amount of vectors to copy to buffer from
+ * @from: the source to copy from
*
* The function will first copy up to headlen, and then pin the userspace
* pages and build frags through them.
*
* Returns 0, -EFAULT or -EMSGSIZE.
- * Note: the iovec is not modified during the copy
*/
-int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
- int offset, size_t count)
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
- int len = iov_length(from, count) - offset;
+ int len = iov_iter_count(from);
int copy = min_t(int, skb_headlen(skb), len);
- int size;
- int i = 0;
+ int frag = 0;
/* copy up to skb headlen */
- if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- if (len == copy)
- return 0;
-
- offset += copy;
- while (count--) {
- struct page *page[MAX_SKB_FRAGS];
- int num_pages;
- unsigned long base;
+ while (iov_iter_count(from)) {
+ struct page *pages[MAX_SKB_FRAGS];
+ size_t start;
+ ssize_t copied;
unsigned long truesize;
+ int n = 0;
- /* Skip over from offset and copied */
- if (offset >= from->iov_len) {
- offset -= from->iov_len;
- ++from;
- continue;
- }
- len = from->iov_len - offset;
- base = (unsigned long)from->iov_base + offset;
- size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
- if (i + size > MAX_SKB_FRAGS)
+ if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
- num_pages = get_user_pages_fast(base, size, 0, &page[i]);
- if (num_pages != size) {
- release_pages(&page[i], num_pages, 0);
+
+ copied = iov_iter_get_pages(from, pages, ~0U,
+ MAX_SKB_FRAGS - frag, &start);
+ if (copied < 0)
return -EFAULT;
- }
- truesize = size * PAGE_SIZE;
- skb->data_len += len;
- skb->len += len;
+
+ iov_iter_advance(from, copied);
+
+ truesize = PAGE_ALIGN(copied + start);
+ skb->data_len += copied;
+ skb->len += copied;
skb->truesize += truesize;
atomic_add(truesize, &skb->sk->sk_wmem_alloc);
- while (len) {
- int off = base & ~PAGE_MASK;
- int size = min_t(int, len, PAGE_SIZE - off);
- skb_fill_page_desc(skb, i, page[i], off, size);
- base += size;
- len -= size;
- i++;
+ while (copied) {
+ int size = min_t(int, copied, PAGE_SIZE - start);
+ skb_fill_page_desc(skb, frag++, pages[n], start, size);
+ start = 0;
+ copied -= size;
+ n++;
}
- offset = 0;
- ++from;
}
return 0;
}
-EXPORT_SYMBOL(zerocopy_sg_from_iovec);
+EXPORT_SYMBOL(zerocopy_sg_from_iter);
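Under the new scheme the caller no longer tracks offsets or segment counts itself; the iterator is advanced in place. A rough sketch of a tun/macvtap-style transmit path, assuming a socket context; sock_alloc_send_skb() is a real API, everything named example_* is hypothetical:

    #include <linux/skbuff.h>
    #include <net/sock.h>

    static struct sk_buff *example_build_zerocopy(struct sock *sk,
                                                  struct iov_iter *from)
    {
            size_t linear = min_t(size_t, iov_iter_count(from), 128);
            int err;
            struct sk_buff *skb = sock_alloc_send_skb(sk, linear + 64, 1, &err);

            if (!skb)
                    return NULL;
            skb_put(skb, linear);                    /* linear part is copied... */
            if (zerocopy_sg_from_iter(skb, from)) {  /* ...the rest is pinned */
                    kfree_skb(skb);
                    return NULL;
            }
            return skb;
    }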
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
- u8 __user *to, int len,
+ struct iov_iter *to, int len,
__wsum *csump)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
+ int n;
/* Copy header. */
if (copy > 0) {
- int err = 0;
if (copy > len)
copy = len;
- *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
- *csump, &err);
- if (err)
+ n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+ if (n != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
- to += copy;
pos = copy;
}
@@ -678,26 +561,22 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- __wsum csum2;
- int err = 0;
- u8 *vaddr;
+ __wsum csum2 = 0;
struct page *page = skb_frag_page(frag);
+ u8 *vaddr = kmap(page);
if (copy > len)
copy = len;
- vaddr = kmap(page);
- csum2 = csum_and_copy_to_user(vaddr +
- frag->page_offset +
- offset - start,
- to, copy, 0, &err);
+ n = csum_and_copy_to_iter(vaddr + frag->page_offset +
+ offset - start, copy,
+ &csum2, to);
kunmap(page);
- if (err)
+ if (n != copy)
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if (!(len -= copy))
return 0;
offset += copy;
- to += copy;
pos += copy;
}
start = end;
@@ -722,7 +601,6 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
if ((len -= copy) == 0)
return 0;
offset += copy;
- to += copy;
pos += copy;
}
start = end;
@@ -775,20 +653,19 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
EXPORT_SYMBOL(__skb_checksum_complete);
/**
- * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
+ * skb_copy_and_csum_datagram_msg - Copy and checksum skb to a msghdr.
* @skb: skbuff
* @hlen: hardware length
- * @iov: io vector
+ * @msg: destination
*
* Caller _must_ check that skb will fit to this iovec.
*
* Returns: 0 - success.
* -EINVAL - checksum failure.
- * -EFAULT - fault during copy. Beware, in this case iovec
- * can be modified!
+ * -EFAULT - fault during copy.
*/
-int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
- int hlen, struct iovec *iov)
+int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
+ int hlen, struct msghdr *msg)
{
__wsum csum;
int chunk = skb->len - hlen;
@@ -796,28 +673,20 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
if (!chunk)
return 0;
- /* Skip filled elements.
- * Pretty silly, look at memcpy_toiovec, though 8)
- */
- while (!iov->iov_len)
- iov++;
-
- if (iov->iov_len < chunk) {
+ if (iov_iter_count(&msg->msg_iter) < chunk) {
if (__skb_checksum_complete(skb))
goto csum_error;
- if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
+ if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
goto fault;
} else {
csum = csum_partial(skb->data, hlen, skb->csum);
- if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
+ if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
chunk, &csum))
goto fault;
if (csum_fold(csum))
goto csum_error;
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
netdev_rx_csum_fault(skb->dev);
- iov->iov_len -= chunk;
- iov->iov_base += chunk;
}
return 0;
csum_error:
@@ -825,7 +694,7 @@ csum_error:
fault:
return -EFAULT;
}
-EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
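The pattern in UDP-style receive paths stays the same, just msghdr-based. A sketch, assuming a UDP-like caller where the first sizeof(struct udphdr) bytes were already checksummed via 'hlen'; skb_copy_datagram_msg() is the companion wrapper around skb_copy_datagram_iter():

    #include <linux/skbuff.h>
    #include <linux/udp.h>

    static int example_recvmsg(struct sk_buff *skb, struct msghdr *msg)
    {
            int off = sizeof(struct udphdr);

            if (skb_csum_unnecessary(skb))
                    return skb_copy_datagram_msg(skb, off, msg, skb->len - off);
            /* verifies the checksum while copying; -EINVAL on csum failure */
            return skb_copy_and_csum_datagram_msg(skb, off, msg);
    }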
/**
* datagram_poll - generic datagram poll
diff --git a/net/core/dev.c b/net/core/dev.c
index 3acff0974560..f411c28d0a66 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -118,6 +118,7 @@
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
+#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
@@ -133,6 +134,7 @@
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
+#include <linux/hrtimer.h>
#include "net-sysfs.h"
@@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close);
*/
void dev_disable_lro(struct net_device *dev)
{
- /*
- * If we're trying to disable lro on a vlan device
- * use the underlying physical device instead
- */
- if (is_vlan_dev(dev))
- dev = vlan_dev_real_dev(dev);
-
- /* the same for macvlan devices */
- if (netif_is_macvlan(dev))
- dev = macvlan_dev_real_dev(dev);
+ struct net_device *lower_dev;
+ struct list_head *iter;
dev->wanted_features &= ~NETIF_F_LRO;
netdev_update_features(dev);
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter)
+ dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
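netdev_for_each_lower_dev() walks the devices stacked directly below 'dev' (vlan, macvlan, bond slaves, ...), which is what lets the recursion above replace the two hard-coded special cases. A trivial sketch of the same walk, assuming RTNL is held; example_count_lower() is hypothetical:

    #include <linux/netdevice.h>

    static int example_count_lower(struct net_device *dev)
    {
            struct net_device *lower;
            struct list_head *iter;
            int n = 0;

            netdev_for_each_lower_dev(dev, lower, iter)
                    n++;
            return n;
    }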
@@ -2530,7 +2527,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{
- if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ if (eth_p_mpls(type))
features &= skb->dev->mpls_features;
return features;
@@ -2647,12 +2644,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
netdev_features_t features)
{
if (vlan_tx_tag_present(skb) &&
- !vlan_hw_offload_capable(features, skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto,
- vlan_tx_tag_get(skb));
- if (skb)
- skb->vlan_tci = 0;
- }
+ !vlan_hw_offload_capable(features, skb->vlan_proto))
+ skb = __vlan_hwaccel_push_inside(skb);
return skb;
}
@@ -3304,7 +3297,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
rps_lock(sd);
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
- if (skb_queue_len(&sd->input_pkt_queue)) {
+ if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
@@ -4179,7 +4172,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
struct sk_buff *skb = napi->skb;
if (!skb) {
- skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
napi->skb = skb;
}
return skb;
@@ -4316,20 +4309,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
local_irq_enable();
}
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ return sd->rps_ipi_list != NULL;
+#else
+ return false;
+#endif
+}
+
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
-#ifdef CONFIG_RPS
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
- if (sd->rps_ipi_list) {
+ if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
-#endif
+
napi->weight = weight_p;
local_irq_disable();
while (1) {
@@ -4356,7 +4357,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
- list_del(&napi->poll_list);
napi->state = 0;
rps_unlock(sd);
@@ -4376,7 +4376,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
- * The entry's receive function will be scheduled to run
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
void __napi_schedule(struct napi_struct *n)
{
@@ -4388,18 +4389,29 @@ void __napi_schedule(struct napi_struct *n)
}
EXPORT_SYMBOL(__napi_schedule);
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
+
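A sketch of the intended caller: a hard-irq handler, where interrupts are already masked so the save/restore done by plain __napi_schedule() is wasted work. The example_* names and private struct are hypothetical:

    #include <linux/interrupt.h>
    #include <linux/netdevice.h>

    struct example_priv {                   /* hypothetical driver state */
            struct napi_struct napi;
    };

    static irqreturn_t example_isr(int irq, void *data)
    {
            struct example_priv *priv = data;

            /* a real driver would mask its rx interrupt here */
            if (napi_schedule_prep(&priv->napi))
                    __napi_schedule_irqoff(&priv->napi);
            return IRQ_HANDLED;
    }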
void __napi_complete(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
- BUG_ON(n->gro_list);
- list_del(&n->poll_list);
+ list_del_init(&n->poll_list);
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
-void napi_complete(struct napi_struct *n)
+void napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags;
@@ -4410,12 +4422,28 @@ void napi_complete(struct napi_struct *n)
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
return;
- napi_gro_flush(n, false);
- local_irq_save(flags);
- __napi_complete(n);
- local_irq_restore(flags);
+ if (n->gro_list) {
+ unsigned long timeout = 0;
+
+ if (work_done)
+ timeout = n->dev->gro_flush_timeout;
+
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ else
+ napi_gro_flush(n, false);
+ }
+ if (likely(list_empty(&n->poll_list))) {
+ WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+ } else {
+ /* If n->poll_list is not empty, we need to mask irqs */
+ local_irq_save(flags);
+ __napi_complete(n);
+ local_irq_restore(flags);
+ }
}
-EXPORT_SYMBOL(napi_complete);
+EXPORT_SYMBOL(napi_complete_done);
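Drivers opt in by reporting how much work their poll routine did; with a nonzero gro_flush_timeout on the device, a busy NAPI instance keeps its GRO list across polls and the hrtimer flushes it later. A sketch of the poll side, with example_clean_rx() and example_priv hypothetical:

    #include <linux/netdevice.h>

    struct example_priv {                   /* hypothetical driver state */
            struct napi_struct napi;
    };

    static int example_clean_rx(struct example_priv *priv, int budget);

    static int example_poll(struct napi_struct *napi, int budget)
    {
            struct example_priv *priv =
                    container_of(napi, struct example_priv, napi);
            int work_done = example_clean_rx(priv, budget);

            if (work_done < budget) {
                    napi_complete_done(napi, work_done);
                    /* a real driver would re-enable its rx interrupt here */
            }
            return work_done;
    }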
/* must be called under rcu_read_lock(), as we dont take a reference */
struct napi_struct *napi_by_id(unsigned int napi_id)
@@ -4469,10 +4497,23 @@ void napi_hash_del(struct napi_struct *napi)
}
EXPORT_SYMBOL_GPL(napi_hash_del);
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+ struct napi_struct *napi;
+
+ napi = container_of(timer, struct napi_struct, timer);
+ if (napi->gro_list)
+ napi_schedule(napi);
+
+ return HRTIMER_NORESTART;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
+ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->timer.function = napi_watchdog;
napi->gro_count = 0;
napi->gro_list = NULL;
napi->skb = NULL;
@@ -4491,6 +4532,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
}
EXPORT_SYMBOL(netif_napi_add);
+void napi_disable(struct napi_struct *n)
+{
+ might_sleep();
+ set_bit(NAPI_STATE_DISABLE, &n->state);
+
+ while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+ msleep(1);
+
+ hrtimer_cancel(&n->timer);
+
+ clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
void netif_napi_del(struct napi_struct *napi)
{
list_del_init(&napi->dev_list);
@@ -4507,29 +4562,28 @@ static void net_rx_action(struct softirq_action *h)
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
+ LIST_HEAD(list);
+ LIST_HEAD(repoll);
void *have;
local_irq_disable();
+ list_splice_init(&sd->poll_list, &list);
+ local_irq_enable();
- while (!list_empty(&sd->poll_list)) {
+ while (!list_empty(&list)) {
struct napi_struct *n;
int work, weight;
- /* If softirq window is exhuasted then punt.
+ /* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
- local_irq_enable();
- /* Even though interrupts have been re-enabled, this
- * access is safe because interrupts can only add new
- * entries to the tail of this list, and only ->poll()
- * calls can remove this head entry from the list.
- */
- n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
+ n = list_first_entry(&list, struct napi_struct, poll_list);
+ list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
@@ -4551,8 +4605,6 @@ static void net_rx_action(struct softirq_action *h)
budget -= work;
- local_irq_disable();
-
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
@@ -4560,32 +4612,40 @@ static void net_rx_action(struct softirq_action *h)
*/
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
- local_irq_enable();
napi_complete(n);
- local_irq_disable();
} else {
if (n->gro_list) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
- local_irq_enable();
napi_gro_flush(n, HZ >= 1000);
- local_irq_disable();
}
- list_move_tail(&n->poll_list, &sd->poll_list);
+ list_add_tail(&n->poll_list, &repoll);
}
}
netpoll_poll_unlock(have);
}
+
+ if (!sd_has_rps_ipi_waiting(sd) &&
+ list_empty(&list) &&
+ list_empty(&repoll))
+ return;
out:
+ local_irq_disable();
+
+ list_splice_tail_init(&sd->poll_list, &list);
+ list_splice_tail(&repoll, &list);
+ list_splice(&list, &sd->poll_list);
+ if (!list_empty(&sd->poll_list))
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+
net_rps_action_and_irq_enable(sd);
return;
softnet_break:
sd->time_squeeze++;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
@@ -5786,7 +5846,7 @@ EXPORT_SYMBOL(dev_change_carrier);
* Get device physical port ID
*/
int dev_get_phys_port_id(struct net_device *dev,
- struct netdev_phys_port_id *ppid)
+ struct netdev_phys_item_id *ppid)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -5865,6 +5925,8 @@ static void rollback_registered_many(struct list_head *head)
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
/* Shutdown queueing discipline. */
dev_shutdown(dev);
@@ -5874,6 +5936,11 @@ static void rollback_registered_many(struct list_head *head)
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+ GFP_KERNEL);
+
/*
* Flush the unicast and multicast chains
*/
@@ -5883,9 +5950,8 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index b6b230600b97..c0548d268e1a 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -278,8 +278,8 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
EXPORT_SYMBOL(__hw_addr_sync_dev);
/**
- * __hw_addr_unsync_dev - Remove synchonized addresses from device
- * @list: address list to remove syncronized addresses from
+ * __hw_addr_unsync_dev - Remove synchronized addresses from device
+ * @list: address list to remove synchronized addresses from
* @dev: device to sync
* @unsync: function to call if address should be removed
*
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 72e899a3efda..b94b1d293506 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -142,10 +142,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
case SIOCGIFHWADDR:
if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
+ memset(ifr->ifr_hwaddr.sa_data, 0,
+ sizeof(ifr->ifr_hwaddr.sa_data));
else
memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
ifr->ifr_hwaddr.sa_family = dev->type;
return 0;
@@ -265,7 +267,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
if (ifr->ifr_hwaddr.sa_family != dev->type)
return -EINVAL;
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return 0;
diff --git a/net/core/dst.c b/net/core/dst.c
index a028409ee438..e956ce6d1378 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
-/**
- * __skb_dst_set_noref - sets skb dst, without a reference
- * @skb: buffer
- * @dst: dst entry
- * @force: if force is set, use noref version even for DST_NOCACHE entries
- *
- * Sets skb dst, assuming a reference was not taken on dst
- * skb_dst_drop() should not dst_release() this dst
- */
-void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
-{
- WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
- /* If dst not in cache, we must take a reference, because
- * dst_release() will destroy dst as soon as its refcount becomes zero
- */
- if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
- dst_hold(dst);
- skb_dst_set(skb, dst);
- } else {
- skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
- }
-}
-EXPORT_SYMBOL(__skb_dst_set_noref);
-
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 06dfb293e5aa..550892cd6b3f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -25,6 +25,7 @@
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/sched.h>
+#include <linux/net.h>
/*
* Some useful ethtool_ops methods that're device independent.
@@ -84,7 +85,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
[NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
- [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
@@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
};
+static const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+ [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
+ [ETH_RSS_HASH_XOR_BIT] = "xor",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_FEATURES)
return ARRAY_SIZE(netdev_features_strings);
+ if (sset == ETH_SS_RSS_HASH_FUNCS)
+ return ARRAY_SIZE(rss_hash_func_strings);
+
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
else
@@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev,
if (stringset == ETH_SS_FEATURES)
memcpy(data, netdev_features_strings,
sizeof(netdev_features_strings));
+ else if (stringset == ETH_SS_RSS_HASH_FUNCS)
+ memcpy(data, rss_hash_func_strings,
+ sizeof(rss_hash_func_strings));
else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
@@ -574,6 +586,16 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
return 0;
}
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+
+void netdev_rss_key_fill(void *buffer, size_t len)
+{
+ BUG_ON(len > sizeof(netdev_rss_key));
+ net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key));
+ memcpy(buffer, netdev_rss_key, len);
+}
+EXPORT_SYMBOL(netdev_rss_key_fill);
+
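Drivers call this instead of generating their own random Toeplitz key, so every NIC in the system hashes identically. A sketch, assuming a driver with a 40-byte key register file; the length and example_write_rss_key() are hypothetical, only the helper itself is from this patch:

    static void example_write_rss_key(struct net_device *dev,
                                      const u8 *key, size_t len);

    static void example_set_rss_key(struct net_device *dev)
    {
            u8 key[40];             /* must be <= NETDEV_RSS_KEY_LEN (52) */

            netdev_rss_key_fill(key, sizeof(key));
            example_write_rss_key(dev, key, sizeof(key));
    }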
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -608,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
if (!indir)
return -ENOMEM;
- ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
if (ret)
goto out;
@@ -669,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
goto out;
}
- ret = ops->set_rxfh(dev, indir, NULL);
+ ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
out:
kfree(indir);
@@ -687,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
u32 total_size;
u32 indir_bytes;
u32 *indir = NULL;
+ u8 dev_hfunc = 0;
u8 *hkey = NULL;
u8 *rss_config;
- if (!(dev->ethtool_ops->get_rxfh_indir_size ||
- dev->ethtool_ops->get_rxfh_key_size) ||
- !dev->ethtool_ops->get_rxfh)
+ if (!ops->get_rxfh)
return -EOPNOTSUPP;
if (ops->get_rxfh_indir_size)
@@ -700,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (ops->get_rxfh_key_size)
dev_key_size = ops->get_rxfh_key_size(dev);
- if ((dev_key_size + dev_indir_size) == 0)
- return -EOPNOTSUPP;
-
if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
return -EFAULT;
user_indir_size = rxfh.indir_size;
user_key_size = rxfh.key_size;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+ rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
rxfh.indir_size = dev_indir_size;
@@ -717,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
return -EFAULT;
- /* If the user buffer size is 0, this is just a query for the
- * device table size and key size. Otherwise, if the User size is
- * not equal to device table size or key size it's an error.
- */
- if (!user_indir_size && !user_key_size)
- return 0;
-
if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
(user_key_size && (user_key_size != dev_key_size)))
return -EINVAL;
@@ -740,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (user_key_size)
hkey = rss_config + indir_bytes;
- ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey);
- if (!ret) {
- if (copy_to_user(useraddr +
- offsetof(struct ethtool_rxfh, rss_config[0]),
- rss_config, total_size))
- ret = -EFAULT;
- }
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+ if (ret)
+ goto out;
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
+ &dev_hfunc, sizeof(rxfh.hfunc))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size)) {
+ ret = -EFAULT;
+ }
+out:
kfree(rss_config);
return ret;
@@ -766,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
u8 *rss_config;
u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
- if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) ||
- !ops->get_rxnfc || !ops->set_rxfh)
+ if (!ops->get_rxnfc || !ops->set_rxfh)
return -EOPNOTSUPP;
if (ops->get_rxfh_indir_size)
dev_indir_size = ops->get_rxfh_indir_size(dev);
if (ops->get_rxfh_key_size)
dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
- if ((dev_key_size + dev_indir_size) == 0)
- return -EOPNOTSUPP;
if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
return -EFAULT;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+ rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
- /* If either indir or hash key is valid, proceed further.
- * It is not valid to request that both be unchanged.
+ /* If either indir, hash key or function is valid, proceed further.
+ * Must request at least one change: indir size, hash key or function.
*/
if ((rxfh.indir_size &&
rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
rxfh.indir_size != dev_indir_size) ||
(rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
(rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
- rxfh.key_size == 0))
+ rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE))
return -EINVAL;
if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
@@ -835,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
}
}
- ret = ops->set_rxfh(dev, indir, hkey);
+ ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
out:
kfree(rss_config);
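On the driver side, get_rxfh()/set_rxfh() grow a hash-function argument to match. A sketch of a driver's getter, assuming Toeplitz hashing and a 128-entry indirection table kept in hypothetical netdev_priv() state:

    static int example_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
                                u8 *hfunc)
    {
            struct example_priv *priv = netdev_priv(dev);   /* hypothetical */
            int i;

            if (hfunc)
                    *hfunc = ETH_RSS_HASH_TOP;      /* Toeplitz */
            if (indir)
                    for (i = 0; i < 128; i++)
                            indir[i] = priv->rss_indir[i];
            if (key)
                    memcpy(key, priv->rss_key, 40);
            return 0;
    }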
diff --git a/net/core/filter.c b/net/core/filter.c
index 647b12265e18..ec9baea10c16 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -44,6 +44,7 @@
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
+#include <linux/bpf.h>
/**
* sk_filter - run a packet through a socket filter
@@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
static void __bpf_prog_release(struct bpf_prog *prog)
{
- bpf_release_orig_filter(prog);
- bpf_prog_free(prog);
+ if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(prog);
+ } else {
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+ }
}
static void __sk_filter_release(struct sk_filter *fp)
@@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
+#ifdef CONFIG_BPF_SYSCALL
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct sk_filter *fp, *old_fp;
+ struct bpf_prog *prog;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ prog = bpf_prog_get(ufd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ /* valid fd, but invalid program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp) {
+ bpf_prog_put(prog);
+ return -ENOMEM;
+ }
+ fp->prog = prog;
+
+ atomic_set(&fp->refcnt, 0);
+
+ if (!sk_filter_charge(sk, fp)) {
+ __sk_filter_release(fp);
+ return -ENOMEM;
+ }
+
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ rcu_assign_pointer(sk->sk_filter, fp);
+
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+
+ return 0;
+}
+
+/* allow socket filters to call
+ * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
+ */
+static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
+ return NULL;
+ }
+}
+
+static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* skb fields cannot be accessed yet */
+ return false;
+}
+
+static struct bpf_verifier_ops sock_filter_ops = {
+ .get_func_proto = sock_filter_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+ .ops = &sock_filter_ops,
+ .type = BPF_PROG_TYPE_SOCKET_FILTER,
+};
+
+static int __init register_sock_filter_ops(void)
+{
+ bpf_register_prog_type(&tl);
+ return 0;
+}
+late_initcall(register_sock_filter_ops);
+#else
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ return -EOPNOTSUPP;
+}
+#endif
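From userspace, sk_attach_bpf() is reached through the SO_ATTACH_BPF setsockopt added in the same series, taking an eBPF program fd obtained from bpf(BPF_PROG_LOAD, ...). A minimal sketch, assuming the program was loaded as BPF_PROG_TYPE_SOCKET_FILTER:

    #include <sys/socket.h>

    #ifndef SO_ATTACH_BPF
    #define SO_ATTACH_BPF 50    /* asm-generic value; differs on some arches */
    #endif

    static int example_attach(int sock_fd, int prog_fd)
    {
            return setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_BPF,
                              &prog_fd, sizeof(prog_fd));
    }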
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
diff --git a/net/core/iovec.c b/net/core/iovec.c
index e1ec45ab1e63..dcbe98b3726a 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -28,53 +28,6 @@
#include <net/sock.h>
/*
- * Verify iovec. The caller must ensure that the iovec is big enough
- * to hold the message iovec.
- *
- * Save time not doing access_ok. copy_*_user will make this work
- * in any case.
- */
-
-int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
-{
- int size, ct, err;
-
- if (m->msg_name && m->msg_namelen) {
- if (mode == VERIFY_READ) {
- void __user *namep;
- namep = (void __user __force *) m->msg_name;
- err = move_addr_to_kernel(namep, m->msg_namelen,
- address);
- if (err < 0)
- return err;
- }
- m->msg_name = address;
- } else {
- m->msg_name = NULL;
- m->msg_namelen = 0;
- }
-
- size = m->msg_iovlen * sizeof(struct iovec);
- if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
- return -EFAULT;
-
- m->msg_iov = iov;
- err = 0;
-
- for (ct = 0; ct < m->msg_iovlen; ct++) {
- size_t len = iov[ct].iov_len;
-
- if (len > INT_MAX - err) {
- len = INT_MAX - err;
- iov[ct].iov_len = len;
- }
- err += len;
- }
-
- return err;
-}
-
-/*
* And now for the all-in-one: copy and checksum from a user iovec
* directly to a datagram
* Calls to csum_partial but the last must be in 32 bit chunks
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index bd0767e6b2b3..49a9e3e06c08 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -21,7 +21,7 @@
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
-#include <asm/types.h>
+#include <linux/types.h>
enum lw_bits {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ef31fef25e5a..8e38f17288d3 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -56,7 +56,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags);
static void neigh_update_notify(struct neighbour *neigh);
static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
-static struct neigh_table *neigh_tables;
#ifdef CONFIG_PROC_FS
static const struct file_operations neigh_stat_seq_fops;
#endif
@@ -87,13 +86,8 @@ static const struct file_operations neigh_stat_seq_fops;
the most complicated procedure, which we allow is dev->hard_header.
It is supposed, that dev->hard_header is simplistic and does
not make callbacks to neighbour tables.
-
- The last lock is neigh_tbl_lock. It is pure SMP lock, protecting
- list of neighbour tables. This list is used only in process context,
*/
-static DEFINE_RWLOCK(neigh_tbl_lock);
-
static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
kfree_skb(skb);
@@ -773,7 +767,7 @@ static void neigh_periodic_work(struct work_struct *work)
if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
tbl->last_rand = jiffies;
- for (p = &tbl->parms; p; p = p->next)
+ list_for_each_entry(p, &tbl->parms_list, list)
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
@@ -1446,7 +1440,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
{
struct neigh_parms *p;
- for (p = &tbl->parms; p; p = p->next) {
+ list_for_each_entry(p, &tbl->parms_list, list) {
if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
(!p->dev && !ifindex && net_eq(net, &init_net)))
return p;
@@ -1481,8 +1475,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
}
write_lock_bh(&tbl->lock);
- p->next = tbl->parms.next;
- tbl->parms.next = p;
+ list_add(&p->list, &tbl->parms.list);
write_unlock_bh(&tbl->lock);
neigh_parms_data_state_cleanall(p);
@@ -1501,24 +1494,15 @@ static void neigh_rcu_free_parms(struct rcu_head *head)
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
{
- struct neigh_parms **p;
-
if (!parms || parms == &tbl->parms)
return;
write_lock_bh(&tbl->lock);
- for (p = &tbl->parms.next; *p; p = &(*p)->next) {
- if (*p == parms) {
- *p = parms->next;
- parms->dead = 1;
- write_unlock_bh(&tbl->lock);
- if (parms->dev)
- dev_put(parms->dev);
- call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
- return;
- }
- }
+ list_del(&parms->list);
+ parms->dead = 1;
write_unlock_bh(&tbl->lock);
- neigh_dbg(1, "%s: not found\n", __func__);
+ if (parms->dev)
+ dev_put(parms->dev);
+ call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
EXPORT_SYMBOL(neigh_parms_release);
@@ -1530,11 +1514,15 @@ static void neigh_parms_destroy(struct neigh_parms *parms)
static struct lock_class_key neigh_table_proxy_queue_class;
-static void neigh_table_init_no_netlink(struct neigh_table *tbl)
+static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+
+void neigh_table_init(int index, struct neigh_table *tbl)
{
unsigned long now = jiffies;
unsigned long phsize;
+ INIT_LIST_HEAD(&tbl->parms_list);
+ list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
atomic_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
@@ -1574,34 +1562,14 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
-}
-
-void neigh_table_init(struct neigh_table *tbl)
-{
- struct neigh_table *tmp;
-
- neigh_table_init_no_netlink(tbl);
- write_lock(&neigh_tbl_lock);
- for (tmp = neigh_tables; tmp; tmp = tmp->next) {
- if (tmp->family == tbl->family)
- break;
- }
- tbl->next = neigh_tables;
- neigh_tables = tbl;
- write_unlock(&neigh_tbl_lock);
- if (unlikely(tmp)) {
- pr_err("Registering multiple tables for family %d\n",
- tbl->family);
- dump_stack();
- }
+ neigh_tables[index] = tbl;
}
EXPORT_SYMBOL(neigh_table_init);
-int neigh_table_clear(struct neigh_table *tbl)
+int neigh_table_clear(int index, struct neigh_table *tbl)
{
- struct neigh_table **tp;
-
+ neigh_tables[index] = NULL;
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
@@ -1609,14 +1577,6 @@ int neigh_table_clear(struct neigh_table *tbl)
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
- write_lock(&neigh_tbl_lock);
- for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
- if (*tp == tbl) {
- *tp = tbl->next;
- break;
- }
- }
- write_unlock(&neigh_tbl_lock);
call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
neigh_hash_free_rcu);
@@ -1634,12 +1594,32 @@ int neigh_table_clear(struct neigh_table *tbl)
}
EXPORT_SYMBOL(neigh_table_clear);
+static struct neigh_table *neigh_find_table(int family)
+{
+ struct neigh_table *tbl = NULL;
+
+ switch (family) {
+ case AF_INET:
+ tbl = neigh_tables[NEIGH_ARP_TABLE];
+ break;
+ case AF_INET6:
+ tbl = neigh_tables[NEIGH_ND_TABLE];
+ break;
+ case AF_DECnet:
+ tbl = neigh_tables[NEIGH_DN_TABLE];
+ break;
+ }
+
+ return tbl;
+}
+
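With the fixed-size array, protocols register by index instead of linking into a list. A sketch of the ARP side, which is the likely companion change (its call sites are not part of this diff):

    void example_arp_register(void)
    {
            neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
    }

    void example_arp_unregister(void)
    {
            neigh_table_clear(NEIGH_ARP_TABLE, &arp_tbl);
    }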
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *dst_attr;
struct neigh_table *tbl;
+ struct neighbour *neigh;
struct net_device *dev = NULL;
int err = -EINVAL;
@@ -1660,39 +1640,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
}
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- struct neighbour *neigh;
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
- if (tbl->family != ndm->ndm_family)
- continue;
- read_unlock(&neigh_tbl_lock);
-
- if (nla_len(dst_attr) < tbl->key_len)
- goto out;
-
- if (ndm->ndm_flags & NTF_PROXY) {
- err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
- goto out;
- }
+ if (nla_len(dst_attr) < tbl->key_len)
+ goto out;
- if (dev == NULL)
- goto out;
+ if (ndm->ndm_flags & NTF_PROXY) {
+ err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+ goto out;
+ }
- neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
- if (neigh == NULL) {
- err = -ENOENT;
- goto out;
- }
+ if (dev == NULL)
+ goto out;
- err = neigh_update(neigh, NULL, NUD_FAILED,
- NEIGH_UPDATE_F_OVERRIDE |
- NEIGH_UPDATE_F_ADMIN);
- neigh_release(neigh);
+ neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ if (neigh == NULL) {
+ err = -ENOENT;
goto out;
}
- read_unlock(&neigh_tbl_lock);
- err = -EAFNOSUPPORT;
+
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
out:
return err;
@@ -1700,11 +1672,14 @@ out:
static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct neigh_table *tbl;
struct net_device *dev = NULL;
+ struct neighbour *neigh;
+ void *dst, *lladdr;
int err;
ASSERT_RTNL();
@@ -1728,70 +1703,60 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
goto out;
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
- struct neighbour *neigh;
- void *dst, *lladdr;
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
- if (tbl->family != ndm->ndm_family)
- continue;
- read_unlock(&neigh_tbl_lock);
+ if (nla_len(tb[NDA_DST]) < tbl->key_len)
+ goto out;
+ dst = nla_data(tb[NDA_DST]);
+ lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
- if (nla_len(tb[NDA_DST]) < tbl->key_len)
- goto out;
- dst = nla_data(tb[NDA_DST]);
- lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+ if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
- if (ndm->ndm_flags & NTF_PROXY) {
- struct pneigh_entry *pn;
+ err = -ENOBUFS;
+ pn = pneigh_lookup(tbl, net, dst, dev, 1);
+ if (pn) {
+ pn->flags = ndm->ndm_flags;
+ err = 0;
+ }
+ goto out;
+ }
- err = -ENOBUFS;
- pn = pneigh_lookup(tbl, net, dst, dev, 1);
- if (pn) {
- pn->flags = ndm->ndm_flags;
- err = 0;
- }
+ if (dev == NULL)
+ goto out;
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (neigh == NULL) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
goto out;
}
- if (dev == NULL)
+ neigh = __neigh_lookup_errno(tbl, dst, dev);
+ if (IS_ERR(neigh)) {
+ err = PTR_ERR(neigh);
+ goto out;
+ }
+ } else {
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ err = -EEXIST;
+ neigh_release(neigh);
goto out;
-
- neigh = neigh_lookup(tbl, dst, dev);
- if (neigh == NULL) {
- if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- err = -ENOENT;
- goto out;
- }
-
- neigh = __neigh_lookup_errno(tbl, dst, dev);
- if (IS_ERR(neigh)) {
- err = PTR_ERR(neigh);
- goto out;
- }
- } else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
- err = -EEXIST;
- neigh_release(neigh);
- goto out;
- }
-
- if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
- flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
- if (ndm->ndm_flags & NTF_USE) {
- neigh_event_send(neigh, NULL);
- err = 0;
- } else
- err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
- neigh_release(neigh);
- goto out;
+ if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+ flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
- read_unlock(&neigh_tbl_lock);
- err = -EAFNOSUPPORT;
+ if (ndm->ndm_flags & NTF_USE) {
+ neigh_event_send(neigh, NULL);
+ err = 0;
+ } else
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+ neigh_release(neigh);
+
out:
return err;
}
@@ -1990,7 +1955,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
struct nlattr *tb[NDTA_MAX+1];
- int err;
+ bool found = false;
+ int err, tidx;
err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
nl_neightbl_policy);
@@ -2003,19 +1969,21 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
}
ndtmsg = nlmsg_data(nlh);
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
-
- if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0)
+ if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
+ found = true;
break;
+ }
}
- if (tbl == NULL) {
- err = -ENOENT;
- goto errout_locked;
- }
+ if (!found)
+ return -ENOENT;
/*
* We acquire tbl->lock to be nice to the periodic timers and
@@ -2126,8 +2094,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
errout_tbl_lock:
write_unlock_bh(&tbl->lock);
-errout_locked:
- read_unlock(&neigh_tbl_lock);
errout:
return err;
}
@@ -2142,10 +2108,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
+
if (tidx < tbl_skip || (family && tbl->family != family))
continue;
@@ -2154,7 +2123,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
NLM_F_MULTI) <= 0)
break;
- for (nidx = 0, p = tbl->parms.next; p; p = p->next) {
+ nidx = 0;
+ p = list_next_entry(&tbl->parms, list);
+ list_for_each_entry_from(p, &tbl->parms_list, list) {
if (!net_eq(neigh_parms_net(p), net))
continue;
@@ -2174,7 +2145,6 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
neigh_skip = 0;
}
out:
- read_unlock(&neigh_tbl_lock);
cb->args[0] = tidx;
cb->args[1] = nidx;
@@ -2357,7 +2327,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
int proxy = 0;
int err;
- read_lock(&neigh_tbl_lock);
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
/* check for full ndmsg structure presence, family member is
@@ -2369,8 +2338,11 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
s_t = cb->args[0];
- for (tbl = neigh_tables, t = 0; tbl;
- tbl = tbl->next, t++) {
+ for (t = 0; t < NEIGH_NR_TABLES; t++) {
+ tbl = neigh_tables[t];
+
+ if (!tbl)
+ continue;
if (t < s_t || (family && tbl->family != family))
continue;
if (t > s_t)
@@ -2383,7 +2355,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0)
break;
}
- read_unlock(&neigh_tbl_lock);
cb->args[0] = t;
return skb->len;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 9dd06699b09c..999341244434 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -12,6 +12,7 @@
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
+#include <net/switchdev.h>
#include <linux/if_arp.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
@@ -325,6 +326,23 @@ static ssize_t tx_queue_len_store(struct device *dev,
}
NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
+static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
+{
+ dev->gro_flush_timeout = val;
+ return 0;
+}
+
+static ssize_t gro_flush_timeout_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+}
+NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+
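The new attribute surfaces as /sys/class/net/<dev>/gro_flush_timeout and is interpreted in nanoseconds by napi_complete_done() above. A userspace sketch; the interface name and the 20 us value are arbitrary:

    #include <stdio.h>

    static int example_set_gro_timeout(void)
    {
            FILE *f = fopen("/sys/class/net/eth0/gro_flush_timeout", "w");

            if (!f)
                    return -1;
            fprintf(f, "%u\n", 20000);      /* 20 us, in ns */
            return fclose(f);
    }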
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -387,7 +405,7 @@ static ssize_t phys_port_id_show(struct device *dev,
return restart_syscall();
if (dev_isalive(netdev)) {
- struct netdev_phys_port_id ppid;
+ struct netdev_phys_item_id ppid;
ret = dev_get_phys_port_id(netdev, &ppid);
if (!ret)
@@ -399,6 +417,28 @@ static ssize_t phys_port_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(phys_port_id);
+static ssize_t phys_switch_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct netdev_phys_item_id ppid;
+
+ ret = netdev_switch_parent_id_get(netdev, &ppid);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_switch_id);
+
static struct attribute *net_class_attrs[] = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -422,7 +462,9 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_mtu.attr,
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
+ &dev_attr_gro_flush_timeout.attr,
&dev_attr_phys_port_id.attr,
+ &dev_attr_phys_switch_id.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e6645b4f330a..e0ad5d16c9c5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -79,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
if (vlan_tx_tag_present(skb) &&
!vlan_hw_offload_capable(features, skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto,
- vlan_tx_tag_get(skb));
+ skb = __vlan_hwaccel_push_inside(skb);
if (unlikely(!skb)) {
/* This is actually a packet drop, but we
* don't want the code that calls this
@@ -88,7 +87,6 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
*/
goto out;
}
- skb->vlan_tci = 0;
}
status = netdev_start_xmit(skb, dev, txq, false);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 443256bdcddc..da934fc3faa8 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3728,8 +3728,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Remove proc before if_list entry, because add_device uses
* list to determine if interface already exist, avoid race
* with proc_create_data() */
- if (pkt_dev->entry)
- proc_remove(pkt_dev->entry);
+ proc_remove(pkt_dev->entry);
/* And update the thread if_list */
_rem_dev_from_if_list(t, pkt_dev);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 88e8de3b59b0..d06107d36ec8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -36,6 +36,7 @@
#include <linux/mutex.h>
#include <linux/if_addr.h>
#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
#include <linux/pci.h>
#include <linux/etherdevice.h>
@@ -43,6 +44,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
+#include <net/switchdev.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
@@ -868,7 +870,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
- + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -952,7 +955,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
{
int err;
- struct netdev_phys_port_id ppid;
+ struct netdev_phys_item_id ppid;
err = dev_get_phys_port_id(dev, &ppid);
if (err) {
@@ -967,6 +970,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_item_id psid;
+
+ err = netdev_switch_parent_id_get(dev, &psid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask)
@@ -1039,6 +1060,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (rtnl_phys_port_id_fill(skb, dev))
goto nla_put_failure;
+ if (rtnl_phys_switch_id_fill(skb, dev))
+ goto nla_put_failure;
+
attr = nla_reserve(skb, IFLA_STATS,
sizeof(struct rtnl_link_stats));
if (attr == NULL)
@@ -1196,8 +1220,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PROMISCUITY] = { .type = NLA_U32 },
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
- [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
+ [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
+ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2221,8 +2246,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
- gfp_t flags)
+struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
+ unsigned int change, gfp_t flags)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
@@ -2240,11 +2265,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
kfree_skb(skb);
goto errout;
}
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
- return;
+ return skb;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return NULL;
+}
+
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags)
+{
+ struct sk_buff *skb;
+
+ skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, flags);
}
EXPORT_SYMBOL(rtmsg_ifinfo);
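
Editor's note: splitting rtmsg_ifinfo() into a build half and a send half lets a caller construct the RTM_NEWLINK skb at one point and emit the notification later. A hedged sketch of a hypothetical caller (the function name and the deferral point are illustrative, not from this patch):

    /* Illustrative only: build the message now, send it once the
     * device state change is fully committed.
     */
    static void example_notify(struct net_device *dev)
    {
            struct sk_buff *skb;

            skb = rtmsg_ifinfo_build_skb(RTM_NEWLINK, dev, 0, GFP_KERNEL);
            /* ... finish committing the state change ... */
            if (skb)
                    rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
    }
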
@@ -2313,7 +2355,7 @@ errout:
int ndo_dflt_fdb_add(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
- const unsigned char *addr,
+ const unsigned char *addr, u16 vid,
u16 flags)
{
int err = -EINVAL;
@@ -2339,6 +2381,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
}
EXPORT_SYMBOL(ndo_dflt_fdb_add);
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+{
+ u16 vid = 0;
+
+ if (vlan_attr) {
+ if (nla_len(vlan_attr) != sizeof(u16)) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+ return -EINVAL;
+ }
+
+ vid = nla_get_u16(vlan_attr);
+
+ if (!vid || vid >= VLAN_VID_MASK) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
+ vid);
+ return -EINVAL;
+ }
+ }
+ *p_vid = vid;
+ return 0;
+}
+
static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
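
Editor's note: with NDA_VLAN parsed into a u16 by fdb_vid_parse(), every ndo_fdb_add/ndo_fdb_del implementation now receives the VLAN id directly. A hedged sketch of a driver-side ndo_fdb_add under the new signature (the driver body is an assumption):

    /* Illustrative ndo_fdb_add with the new vid argument; vid == 0
     * means "no VLAN", and non-zero values have already been
     * range-checked by fdb_vid_parse() in rtnl_fdb_add().
     */
    static int example_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
                                   struct net_device *dev,
                                   const unsigned char *addr, u16 vid,
                                   u16 flags)
    {
            /* program (addr, vid) into the hardware forwarding table */
            return 0;
    }
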
@@ -2346,6 +2410,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *tb[NDA_MAX+1];
struct net_device *dev;
u8 *addr;
+ u16 vid;
int err;
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
@@ -2371,6 +2436,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
addr = nla_data(tb[NDA_LLADDR]);
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ if (err)
+ return err;
+
err = -EOPNOTSUPP;
/* Support fdb on master device the net/bridge default case */
@@ -2379,7 +2448,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
- err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags);
+ err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags);
if (err)
goto out;
else
@@ -2390,9 +2460,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
if ((ndm->ndm_flags & NTF_SELF)) {
if (dev->netdev_ops->ndo_fdb_add)
err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ vid,
nlh->nlmsg_flags);
else
- err = ndo_dflt_fdb_add(ndm, tb, dev, addr,
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
nlh->nlmsg_flags);
if (!err) {
@@ -2410,7 +2481,7 @@ out:
int ndo_dflt_fdb_del(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
- const unsigned char *addr)
+ const unsigned char *addr, u16 vid)
{
int err = -EINVAL;
@@ -2439,6 +2510,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net_device *dev;
int err = -EINVAL;
__u8 *addr;
+ u16 vid;
if (!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
@@ -2466,6 +2538,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
addr = nla_data(tb[NDA_LLADDR]);
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ if (err)
+ return err;
+
err = -EOPNOTSUPP;
/* Support fdb on master device the net/bridge default case */
@@ -2475,7 +2551,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
const struct net_device_ops *ops = br_dev->netdev_ops;
if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr);
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
if (err)
goto out;
@@ -2486,9 +2562,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
/* Embedded bridge, macvlan, and any other device support */
if (ndm->ndm_flags & NTF_SELF) {
if (dev->netdev_ops->ndo_fdb_del)
- err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+ err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
+ vid);
else
- err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
if (!err) {
rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
@@ -2628,12 +2705,22 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
+static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
+ unsigned int attrnum, unsigned int flag)
+{
+ if (mask & flag)
+ return nla_put_u8(skb, attrnum, !!(flags & flag));
+ return 0;
+}
+
int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
- struct net_device *dev, u16 mode)
+ struct net_device *dev, u16 mode,
+ u32 flags, u32 mask)
{
struct nlmsghdr *nlh;
struct ifinfomsg *ifm;
struct nlattr *br_afspec;
+ struct nlattr *protinfo;
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
@@ -2665,13 +2752,46 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
if (!br_afspec)
goto nla_put_failure;
- if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) ||
- nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
nla_nest_cancel(skb, br_afspec);
goto nla_put_failure;
}
+
+ if (mode != BRIDGE_MODE_UNDEF) {
+ if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
nla_nest_end(skb, br_afspec);
+ protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+ if (!protinfo)
+ goto nla_put_failure;
+
+ if (brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_FAST_LEAVE,
+ BR_MULTICAST_FAST_LEAVE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING, BR_LEARNING) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ nla_nest_cancel(skb, protinfo);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, protinfo);
+
return nlmsg_end(skb, nlh);
nla_put_failure:
nlmsg_cancel(skb, nlh);
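
Editor's note: ndo_dflt_bridge_getlink() now also emits an IFLA_PROTINFO nest, encoding only those bridge-port flags selected by mask. A hedged sketch of a switch driver delegating to it (the struct and field names are assumptions, not from this patch):

    /* Illustrative: a driver exposing hairpin and learning state
     * through the default getlink helper.
     */
    static int example_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                                      struct net_device *dev,
                                      u32 filter_mask)
    {
            struct example_port *port = netdev_priv(dev);

            return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
                                           BRIDGE_MODE_UNDEF,
                                           port->brport_flags,
                                           BR_HAIRPIN_MODE | BR_LEARNING);
    }
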
diff --git a/net/core/scm.c b/net/core/scm.c
index b442e7e25e60..3b6899b7d810 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
struct cmsghdr *cmsg;
int err;
- for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
- {
+ for_each_cmsghdr(cmsg, msg) {
err = -EINVAL;
/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
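
Editor's note: for_each_cmsghdr() is a straight wrapper around the loop it replaces here; its definition in <linux/socket.h> is equivalent to:

    #define for_each_cmsghdr(cmsg, msg) \
            for (cmsg = CMSG_FIRSTHDR(msg); \
                 cmsg; \
                 cmsg = CMSG_NXTHDR(msg, cmsg))
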
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 32e31c299631..ae13ef6b3ea7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->fclone = SKB_FCLONE_ORIG;
atomic_set(&fclones->fclone_ref, 1);
- fclones->skb2.fclone = SKB_FCLONE_FREE;
+ fclones->skb2.fclone = SKB_FCLONE_CLONE;
fclones->skb2.pfmemalloc = pfmemalloc;
}
out:
@@ -336,59 +336,85 @@ struct netdev_alloc_cache {
unsigned int pagecnt_bias;
};
static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
+ gfp_t gfp_mask)
{
- struct netdev_alloc_cache *nc;
- void *data = NULL;
- int order;
- unsigned long flags;
+ const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
+ struct page *page = NULL;
+ gfp_t gfp = gfp_mask;
+
+ if (order) {
+ gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+ page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+ nc->frag.size = PAGE_SIZE << (page ? order : 0);
+ }
- local_irq_save(flags);
- nc = this_cpu_ptr(&netdev_alloc_cache);
- if (unlikely(!nc->frag.page)) {
+ if (unlikely(!page))
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+ nc->frag.page = page;
+
+ return page;
+}
+
+static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
+ unsigned int fragsz, gfp_t gfp_mask)
+{
+ struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
+ struct page *page = nc->frag.page;
+ unsigned int size;
+ int offset;
+
+ if (unlikely(!page)) {
refill:
- for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
- gfp_t gfp = gfp_mask;
+ page = __page_frag_refill(nc, gfp_mask);
+ if (!page)
+ return NULL;
+
+ /* if size can vary use frag.size else just use PAGE_SIZE */
+ size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
- if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN;
- nc->frag.page = alloc_pages(gfp, order);
- if (likely(nc->frag.page))
- break;
- if (--order < 0)
- goto end;
- }
- nc->frag.size = PAGE_SIZE << order;
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1,
- &nc->frag.page->_count);
- nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
- nc->frag.offset = 0;
+ atomic_add(size - 1, &page->_count);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ nc->frag.offset = size;
}
- if (nc->frag.offset + fragsz > nc->frag.size) {
- if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) {
- if (!atomic_sub_and_test(nc->pagecnt_bias,
- &nc->frag.page->_count))
- goto refill;
- /* OK, page count is 0, we can safely set it */
- atomic_set(&nc->frag.page->_count,
- NETDEV_PAGECNT_MAX_BIAS);
- } else {
- atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias,
- &nc->frag.page->_count);
- }
- nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
- nc->frag.offset = 0;
+ offset = nc->frag.offset - fragsz;
+ if (unlikely(offset < 0)) {
+ if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ goto refill;
+
+ /* if size can vary use frag.size else just use PAGE_SIZE */
+ size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+ /* OK, page count is 0, we can safely set it */
+ atomic_set(&page->_count, size);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ offset = size - fragsz;
}
- data = page_address(nc->frag.page) + nc->frag.offset;
- nc->frag.offset += fragsz;
nc->pagecnt_bias--;
-end:
+ nc->frag.offset = offset;
+
+ return page_address(page) + offset;
+}
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ unsigned long flags;
+ void *data;
+
+ local_irq_save(flags);
+ data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
}
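
Editor's note: the rewritten allocator hands out fragments from the end of the page towards the start: frag.offset counts down and pagecnt_bias tracks how many page references the cache still holds. A worked example, assuming 4 KiB pages and the order-3 (32 KiB) refill that NETDEV_FRAG_PAGE_MAX_ORDER selects:

    /* refill:      page->_count = 32768, pagecnt_bias = 32768,
     *              frag.offset = 32768
     * alloc 1500:  offset = 32768 - 1500 = 31268, bias = 32767
     * alloc 1500:  offset = 31268 - 1500 = 29768, bias = 32766
     * ...
     * offset < 0:  atomic_sub_and_test(bias, &_count) == true means
     *              every fragment handed out has since been freed, so
     *              the page is recycled in place; otherwise the cache
     *              drops its references and refills a fresh page.
     */
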
@@ -406,11 +432,25 @@ void *netdev_alloc_frag(unsigned int fragsz)
}
EXPORT_SYMBOL(netdev_alloc_frag);
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+ return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
/**
- * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
- * @dev: network device to receive on
+ * __alloc_rx_skb - allocate an skbuff for rx
* @length: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
+ * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case we have to fallback to __alloc_skb()
+ * If SKB_ALLOC_NAPI is set, page fragment will be allocated
+ * from napi_cache instead of netdev_cache.
*
* Allocate a new &sk_buff and assign it a usage count of one. The
* buffer has unspecified headroom built in. Users should allocate
@@ -419,11 +459,11 @@ EXPORT_SYMBOL(netdev_alloc_frag);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
- unsigned int length, gfp_t gfp_mask)
+static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
+ int flags)
{
struct sk_buff *skb = NULL;
- unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
+ unsigned int fragsz = SKB_DATA_ALIGN(length) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
@@ -432,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __netdev_alloc_frag(fragsz, gfp_mask);
+ data = (flags & SKB_ALLOC_NAPI) ?
+ __napi_alloc_frag(fragsz, gfp_mask) :
+ __netdev_alloc_frag(fragsz, gfp_mask);
if (likely(data)) {
skb = build_skb(data, fragsz);
@@ -440,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
put_page(virt_to_head_page(data));
}
} else {
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ skb = __alloc_skb(length, gfp_mask,
SKB_ALLOC_RX, NUMA_NO_NODE);
}
+ return skb;
+}
+
+/**
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+ unsigned int length, gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+
+ length += NET_SKB_PAD;
+ skb = __alloc_rx_skb(length, gfp_mask, 0);
+
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
}
+
return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
+/**
+ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * @napi: napi instance this buffer was allocated for
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
+ *
+ * Allocate a new sk_buff for use in NAPI receive. This buffer will
+ * attempt to allocate the head from a special reserved region used
+ * only for NAPI Rx allocation. By doing this we can save several
+ * CPU cycles by avoiding having to disable and re-enable IRQs.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
+ unsigned int length, gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+
+ length += NET_SKB_PAD + NET_IP_ALIGN;
+ skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ skb->dev = napi->dev;
+ }
+
+ return skb;
+}
+EXPORT_SYMBOL(__napi_alloc_skb);
+
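
Editor's note: a driver running in NAPI poll context can use the new helper and skip the IRQ save/restore that __netdev_alloc_frag() needs. A hedged sketch of a poll routine, assuming the napi_alloc_skb() inline wrapper added alongside __napi_alloc_skb(); EXAMPLE_RX_BUF_LEN and the ring-fill step are illustrative:

    /* Illustrative NAPI poll fragment: allocate rx skbs from the
     * napi_alloc_cache; safe because poll runs in softirq context.
     */
    static int example_poll(struct napi_struct *napi, int budget)
    {
            int work = 0;

            while (work < budget) {
                    struct sk_buff *skb;

                    skb = napi_alloc_skb(napi, EXAMPLE_RX_BUF_LEN);
                    if (unlikely(!skb))
                            break;
                    /* ... fill skb from the rx ring, then ... */
                    napi_gro_receive(napi, skb);
                    work++;
            }
            return work;
    }
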
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize)
{
@@ -541,26 +638,27 @@ static void kfree_skbmem(struct sk_buff *skb)
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
kmem_cache_free(skbuff_head_cache, skb);
- break;
+ return;
case SKB_FCLONE_ORIG:
fclones = container_of(skb, struct sk_buff_fclones, skb1);
- if (atomic_dec_and_test(&fclones->fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, fclones);
- break;
- case SKB_FCLONE_CLONE:
- fclones = container_of(skb, struct sk_buff_fclones, skb2);
-
- /* The clone portion is available for
- * fast-cloning again.
+ /* We usually free the clone (on TX completion) before the original
+ * skb. That test would never be true for the clone, but here, for
+ * the original, branch prediction will be good.
*/
- skb->fclone = SKB_FCLONE_FREE;
+ if (atomic_read(&fclones->fclone_ref) == 1)
+ goto fastpath;
+ break;
- if (atomic_dec_and_test(&fclones->fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, fclones);
+ default: /* SKB_FCLONE_CLONE */
+ fclones = container_of(skb, struct sk_buff_fclones, skb2);
break;
}
+ if (!atomic_dec_and_test(&fclones->fclone_ref))
+ return;
+fastpath:
+ kmem_cache_free(skbuff_fclone_cache, fclones);
}
static void skb_release_head_state(struct sk_buff *skb)
@@ -872,15 +970,15 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
struct sk_buff_fclones *fclones = container_of(skb,
struct sk_buff_fclones,
skb1);
- struct sk_buff *n = &fclones->skb2;
+ struct sk_buff *n;
if (skb_orphan_frags(skb, gfp_mask))
return NULL;
if (skb->fclone == SKB_FCLONE_ORIG &&
- n->fclone == SKB_FCLONE_FREE) {
- n->fclone = SKB_FCLONE_CLONE;
- atomic_inc(&fclones->fclone_ref);
+ atomic_read(&fclones->fclone_ref) == 1) {
+ n = &fclones->skb2;
+ atomic_set(&fclones->fclone_ref, 2);
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -3002,7 +3100,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (nskb->len == len + doffset)
goto perform_csum_check;
- if (!sg) {
+ if (!sg && !nskb->remcsum_offload) {
nskb->ip_summed = CHECKSUM_NONE;
nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb, len),
@@ -3074,7 +3172,7 @@ skip_fraglist:
nskb->truesize += nskb->data_len;
perform_csum_check:
- if (!csum) {
+ if (!csum && !nskb->remcsum_offload) {
nskb->csum = skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
nskb->ip_summed = CHECKSUM_NONE;
@@ -3088,6 +3186,16 @@ perform_csum_check:
* (see validate_xmit_skb_list() for example)
*/
segs->prev = tail;
+
+ /* The following permits correct backpressure for protocols
+ * using skb_set_owner_w().
+ * The idea is to transfer ownership from head_skb to the last segment.
+ */
+ if (head_skb->destructor == sock_wfree) {
+ swap(tail->truesize, head_skb->truesize);
+ swap(tail->destructor, head_skb->destructor);
+ swap(tail->sk, head_skb->sk);
+ }
return segs;
err:
@@ -4130,6 +4238,113 @@ err_free:
}
EXPORT_SYMBOL(skb_vlan_untag);
+int skb_ensure_writable(struct sk_buff *skb, int write_len)
+{
+ if (!pskb_may_pull(skb, write_len))
+ return -ENOMEM;
+
+ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+ return 0;
+
+ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable);
+
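
Editor's note: skb_ensure_writable() is the canonical "make the first write_len bytes private before mangling them" helper. A hedged usage sketch, assuming the mac header has already been set on the skb; the caller name is illustrative:

    /* Illustrative: make the Ethernet header private before editing
     * it in place.
     */
    static int example_set_dst_mac(struct sk_buff *skb, const u8 *mac)
    {
            int err;

            err = skb_ensure_writable(skb, ETH_HLEN);
            if (unlikely(err))
                    return err;

            ether_addr_copy(eth_hdr(skb)->h_dest, mac);
            return 0;
    }
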
+/* remove VLAN header from packet and update csum accordingly. */
+static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+{
+ struct vlan_hdr *vhdr;
+ unsigned int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ __skb_push(skb, offset);
+ err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
+ if (unlikely(err))
+ goto pull;
+
+ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+
+ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+ *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+ __skb_pull(skb, VLAN_HLEN);
+
+ vlan_set_encap_proto(skb, vhdr);
+ skb->mac_header += VLAN_HLEN;
+
+ if (skb_network_offset(skb) < ETH_HLEN)
+ skb_set_network_header(skb, ETH_HLEN);
+
+ skb_reset_mac_len(skb);
+pull:
+ __skb_pull(skb, offset);
+
+ return err;
+}
+
+int skb_vlan_pop(struct sk_buff *skb)
+{
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ int err;
+
+ if (likely(vlan_tx_tag_present(skb))) {
+ skb->vlan_tci = 0;
+ } else {
+ if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
+ skb->protocol != htons(ETH_P_8021AD)) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (err)
+ return err;
+ }
+ /* move next vlan tag to hw accel tag */
+ if (likely((skb->protocol != htons(ETH_P_8021Q) &&
+ skb->protocol != htons(ETH_P_8021AD)) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ vlan_proto = skb->protocol;
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (unlikely(err))
+ return err;
+
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_pop);
+
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+{
+ if (vlan_tx_tag_present(skb)) {
+ unsigned int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ /* __vlan_insert_tag expects skb->data to point to the mac header,
+ * so move skb->data before calling it and restore the original
+ * position afterwards.
+ */
+ __skb_push(skb, offset);
+ err = __vlan_insert_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (err)
+ return err;
+ skb->protocol = skb->vlan_proto;
+ skb->mac_len += VLAN_HLEN;
+ __skb_pull(skb, offset);
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(skb->data
+ + (2 * ETH_ALEN), VLAN_HLEN, 0));
+ }
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_push);
+
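
Editor's note: the pop/push pair composes naturally for retagging. A hedged sketch (the caller name and retag policy are assumptions):

    /* Illustrative: replace the outermost VLAN tag with a new
     * 802.1Q tag carrying new_vid.
     */
    static int example_retag(struct sk_buff *skb, u16 new_vid)
    {
            int err;

            err = skb_vlan_pop(skb);
            if (err)
                    return err;

            return skb_vlan_push(skb, htons(ETH_P_8021Q), new_vid);
    }
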
/**
* alloc_skb_with_frags - allocate skb with page frags
*
diff --git a/net/core/sock.c b/net/core/sock.c
index 15e0c67b1069..9a56b2000c3f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -888,6 +888,19 @@ set_rcvbuf:
}
break;
+ case SO_ATTACH_BPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_attach_bpf(ufd, sk);
+ }
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
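
Editor's note: unlike SO_ATTACH_FILTER, which takes a struct sock_fprog holding a classic BPF program, SO_ATTACH_BPF takes the fd of an eBPF program already loaded via the bpf(2) syscall. A hedged userspace sketch:

    /* Userspace illustration: attach an eBPF program (prog_fd obtained
     * earlier from bpf(BPF_PROG_LOAD, ...)) as this socket's filter.
     * The kernel expects optlen == sizeof(u32).
     */
    #include <sys/socket.h>

    static int attach_bpf(int sock_fd, int prog_fd)
    {
            return setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_BPF,
                              &prog_fd, sizeof(prog_fd));
    }
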
@@ -1213,6 +1226,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_max_pacing_rate;
break;
+ case SO_INCOMING_CPU:
+ v.val = sk->sk_incoming_cpu;
+ break;
+
default:
return -ENOPROTOOPT;
}
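
Editor's note: the read side is a plain getsockopt(); a consuming thread can use the value to check whether it runs on the CPU that softirq processing last used for this socket. An illustrative userspace sketch:

    /* Illustrative: query which CPU last processed packets for this
     * socket, e.g. to pin the consuming thread nearby.
     */
    static int query_incoming_cpu(int fd)
    {
            int cpu;
            socklen_t len = sizeof(cpu);

            if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) < 0)
                    return -1;
            return cpu;
    }
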
@@ -1517,6 +1534,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_err = 0;
newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
@@ -2457,7 +2475,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err)
goto out_free_skb;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cf9cd13509a7..31baba2a71ce 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,8 @@ static int zero = 0;
static int one = 1;
static int ushort_max = USHRT_MAX;
+static int net_msg_warn; /* Unused, but still a sysctl */
+
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -215,6 +217,18 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
+static int proc_do_rss_key(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table fake_table;
+ char buf[NETDEV_RSS_KEY_LEN * 3];
+
+ snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
+ fake_table.data = buf;
+ fake_table.maxlen = sizeof(buf);
+ return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+}
+
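
Editor's note: "%*phC" renders each byte as two hex digits with ':' separators, so the full key needs NETDEV_RSS_KEY_LEN * 3 - 1 characters plus a terminating NUL, which is why buf is sized NETDEV_RSS_KEY_LEN * 3. An illustrative read follows; the key is randomly generated at boot, so the value shown is made up:

    /* # cat /proc/sys/net/core/netdev_rss_key
     * 61:78:9c:...:f2      (NETDEV_RSS_KEY_LEN = 52 hex bytes)
     */
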
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
@@ -263,6 +277,13 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "netdev_rss_key",
+ .data = &netdev_rss_key,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_do_rss_key,
+ },
#ifdef CONFIG_BPF_JIT
{
.procname = "bpf_jit_enable",
diff --git a/net/core/utils.c b/net/core/utils.c
index efc76dd9dcd1..7b803884c162 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -33,9 +33,6 @@
#include <asm/byteorder.h>
#include <asm/uaccess.h>
-int net_msg_warn __read_mostly = 1;
-EXPORT_SYMBOL(net_msg_warn);
-
DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
/*
* All net warning printk()s should be guarded by this function.