From eda297729171fe16bf34fe5b0419dfb69060f623 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 19 Jul 2013 19:40:10 +0200 Subject: tun: Support software transmit time stamping. This patch adds transmit time stamping to the tun/tap driver. Similar support already exists for UDP, can, and raw packets. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/tun.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index db690a372260..a72d141047cb 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -739,6 +739,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) >= dev->tx_queue_len / tun->numqueues) goto drop; + if (skb->sk) { + sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); + sw_tx_timestamp(skb); + } + /* Orphan the skb - required as we might hang on to it * for indefinite time. */ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) @@ -1476,7 +1481,6 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, return ret; } - static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len, int flags) @@ -1488,10 +1492,15 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, if (!tun) return -EBADFD; - if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out; } + if (flags & MSG_ERRQUEUE) { + ret = sock_recv_errqueue(sock->sk, m, total_len, + SOL_PACKET, TUN_TX_TIMESTAMP); + goto out; + } ret = tun_do_read(tun, tfile, iocb, m->msg_iov, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { @@ -2274,6 +2283,7 @@ static const struct ethtool_ops tun_ethtool_ops = { .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, }; -- cgit v1.2.3-59-g8ed1b From 6680ec68eff47d36f67b4351bc9836fd6cba9532 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 25 Jul 2013 13:00:33 +0800 Subject: tuntap: hardware vlan tx support Inspired by commit f09e2249c4f5c7c13261ec73f5a7807076af0c8e (macvtap: restore vlan header on user read). This patch adds hardware vlan tx support for tuntap. This is done by copying vlan header directly into userspace in tun_put_user() instead of doing it through __vlan_put_tag() in dev_hard_start_xmit(). This eliminates one unnecessary memmove() in vlan_insert_tag() for 802.1ad and 802.1q traffic. pktgen test shows about 20% improvement for 802.1q traffic: Before: 662149pps 317Mb/sec (317831520bps) errors: 0 After: 801033pps 384Mb/sec (384495840bps) errors: 0 Cc: Basil Gor Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a72d141047cb..5dce262f538e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -1265,6 +1266,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, { struct tun_pi pi = { 0, skb->protocol }; ssize_t total = 0; + int vlan_offset = 0; if (!(tun->flags & TUN_NO_PI)) { if ((len -= sizeof(pi)) < 0) @@ -1328,11 +1330,40 @@ static ssize_t tun_put_user(struct tun_struct *tun, total += tun->vnet_hdr_sz; } - len = min_t(int, skb->len, len); + if (!vlan_tx_tag_present(skb)) { + len = min_t(int, skb->len, len); + } else { + int copy, ret; + struct { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + } veth; + + veth.h_vlan_proto = skb->vlan_proto; + veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); + + vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); + len = min_t(int, skb->len + VLAN_HLEN, len); + + copy = min_t(int, vlan_offset, len); + ret = skb_copy_datagram_const_iovec(skb, 0, iv, total, copy); + len -= copy; + total += copy; + if (ret || !len) + goto done; + + copy = min_t(int, sizeof(veth), len); + ret = memcpy_toiovecend(iv, (void *)&veth, total, copy); + len -= copy; + total += copy; + if (ret || !len) + goto done; + } - skb_copy_datagram_const_iovec(skb, 0, iv, total, len); - total += skb->len; + skb_copy_datagram_const_iovec(skb, vlan_offset, iv, total, len); + total += len; +done: tun->dev->stats.tx_packets++; tun->dev->stats.tx_bytes += len; @@ -1691,7 +1722,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun_flow_init(tun); dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | - TUN_USER_FEATURES; + TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; dev->features = dev->hw_features; dev->vlan_features = dev->features; -- cgit v1.2.3-59-g8ed1b From b4bf07771faaf959b0a916d35b1b930c030e30a8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:03 +0800 Subject: net: move iov_pages() to net/core/iovec.c To let it be reused and reduce code duplication. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 23 ----------------------- drivers/net/tun.c | 23 ----------------------- include/linux/socket.h | 2 ++ net/core/iovec.c | 24 ++++++++++++++++++++++++ 4 files changed, 26 insertions(+), 46 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a98fb0ed6aef..dfec20df17ba 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -698,29 +698,6 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb, return 0; } -static unsigned long iov_pages(const struct iovec *iv, int offset, - unsigned long nr_segs) -{ - unsigned long seg, base; - int pages = 0, len, size; - - while (nr_segs && (offset >= iv->iov_len)) { - offset -= iv->iov_len; - ++iv; - --nr_segs; - } - - for (seg = 0; seg < nr_segs; seg++) { - base = (unsigned long)iv[seg].iov_base + offset; - len = iv[seg].iov_len - offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - pages += size; - offset = 0; - } - - return pages; -} - /* Get packet from user space buffer */ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, const struct iovec *iv, unsigned long total_len, diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5dce262f538e..18527ef0cc75 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1041,29 +1041,6 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, return 0; } -static unsigned long iov_pages(const struct iovec *iv, int offset, - unsigned long nr_segs) -{ - unsigned long seg, base; - int pages = 0, len, size; - - while (nr_segs && (offset >= iv->iov_len)) { - offset -= iv->iov_len; - ++iv; - --nr_segs; - } - - for (seg = 0; seg < nr_segs; seg++) { - base = (unsigned long)iv[seg].iov_base + offset; - len = iv[seg].iov_len - offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - pages += size; - offset = 0; - } - - return pages; -} - /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, const struct iovec *iv, diff --git a/include/linux/socket.h b/include/linux/socket.h index 230c04bda3e2..445ef7519dc2 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -313,6 +313,8 @@ extern int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, unsigned int len, __wsum *csump); +extern unsigned long iov_pages(const struct iovec *iov, int offset, + unsigned long nr_segs); extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode); extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, diff --git a/net/core/iovec.c b/net/core/iovec.c index de178e462682..b77eeecc0011 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -212,3 +212,27 @@ out_fault: goto out; } EXPORT_SYMBOL(csum_partial_copy_fromiovecend); + +unsigned long iov_pages(const struct iovec *iov, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iov->iov_len)) { + offset -= iov->iov_len; + ++iov; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iov[seg].iov_base + offset; + len = iov[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} +EXPORT_SYMBOL(iov_pages); -- cgit v1.2.3-59-g8ed1b From c3bdeb5c7cc073ccf5ff9624642022a8613a956e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:04 +0800 Subject: net: move zerocopy_sg_from_iovec() to net/core/datagram.c To let it be reused and reduce code duplication. Also document this function. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 80 ------------------------------------------- drivers/net/tun.c | 80 ------------------------------------------- include/linux/skbuff.h | 4 +++ net/core/datagram.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 160 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index dfec20df17ba..182364abfa35 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -536,86 +536,6 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad, return skb; } -/* set skb frags from iovec, this can move to core network code for reuse */ -static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, - int offset, size_t count) -{ - int len = iov_length(from, count) - offset; - int copy = skb_headlen(skb); - int size, offset1 = 0; - int i = 0; - - /* Skip over from offset */ - while (count && (offset >= from->iov_len)) { - offset -= from->iov_len; - ++from; - --count; - } - - /* copy up to skb headlen */ - while (count && (copy > 0)) { - size = min_t(unsigned int, copy, from->iov_len - offset); - if (copy_from_user(skb->data + offset1, from->iov_base + offset, - size)) - return -EFAULT; - if (copy > size) { - ++from; - --count; - offset = 0; - } else - offset += size; - copy -= size; - offset1 += size; - } - - if (len == offset1) - return 0; - - while (count--) { - struct page *page[MAX_SKB_FRAGS]; - int num_pages; - unsigned long base; - unsigned long truesize; - - len = from->iov_len - offset; - if (!len) { - offset = 0; - ++from; - continue; - } - base = (unsigned long)from->iov_base + offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - if (i + size > MAX_SKB_FRAGS) - return -EMSGSIZE; - num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if (num_pages != size) { - int j; - - for (j = 0; j < num_pages; j++) - put_page(page[i + j]); - return -EFAULT; - } - truesize = size * PAGE_SIZE; - skb->data_len += len; - skb->len += len; - skb->truesize += truesize; - atomic_add(truesize, &skb->sk->sk_wmem_alloc); - while (len) { - int off = base & ~PAGE_MASK; - int size = min_t(int, len, PAGE_SIZE - off); - __skb_fill_page_desc(skb, i, page[i], off, size); - skb_shinfo(skb)->nr_frags++; - /* increase sk_wmem_alloc */ - base += size; - len -= size; - i++; - } - offset = 0; - ++from; - } - return 0; -} - /* * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should * be shared with the tun/tap driver. diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 18527ef0cc75..b1630479d4a1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -961,86 +961,6 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, return skb; } -/* set skb frags from iovec, this can move to core network code for reuse */ -static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, - int offset, size_t count) -{ - int len = iov_length(from, count) - offset; - int copy = skb_headlen(skb); - int size, offset1 = 0; - int i = 0; - - /* Skip over from offset */ - while (count && (offset >= from->iov_len)) { - offset -= from->iov_len; - ++from; - --count; - } - - /* copy up to skb headlen */ - while (count && (copy > 0)) { - size = min_t(unsigned int, copy, from->iov_len - offset); - if (copy_from_user(skb->data + offset1, from->iov_base + offset, - size)) - return -EFAULT; - if (copy > size) { - ++from; - --count; - offset = 0; - } else - offset += size; - copy -= size; - offset1 += size; - } - - if (len == offset1) - return 0; - - while (count--) { - struct page *page[MAX_SKB_FRAGS]; - int num_pages; - unsigned long base; - unsigned long truesize; - - len = from->iov_len - offset; - if (!len) { - offset = 0; - ++from; - continue; - } - base = (unsigned long)from->iov_base + offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - if (i + size > MAX_SKB_FRAGS) - return -EMSGSIZE; - num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if (num_pages != size) { - int j; - - for (j = 0; j < num_pages; j++) - put_page(page[i + j]); - return -EFAULT; - } - truesize = size * PAGE_SIZE; - skb->data_len += len; - skb->len += len; - skb->truesize += truesize; - atomic_add(truesize, &skb->sk->sk_wmem_alloc); - while (len) { - int off = base & ~PAGE_MASK; - int size = min_t(int, len, PAGE_SIZE - off); - __skb_fill_page_desc(skb, i, page[i], off, size); - skb_shinfo(skb)->nr_frags++; - /* increase sk_wmem_alloc */ - base += size; - len -= size; - i++; - } - offset = 0; - ++from; - } - return 0; -} - /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, const struct iovec *iv, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ae46475d4568..5ac96f31d546 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2359,6 +2359,10 @@ extern int skb_copy_datagram_from_iovec(struct sk_buff *skb, const struct iovec *from, int from_offset, int len); +extern int zerocopy_sg_from_iovec(struct sk_buff *skb, + const struct iovec *frm, + int offset, + size_t count); extern int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, const struct iovec *to, diff --git a/net/core/datagram.c b/net/core/datagram.c index 8ab48cd89559..f987faef21ce 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -573,6 +573,99 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +/** + * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * @skb: buffer to copy + * @from: io vector to copy to + * @offset: offset in the io vector to start copying from + * @count: amount of vectors to copy to buffer from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + * Note: the iovec is not modified during the copy + */ +int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + int offset, size_t count) +{ + int len = iov_length(from, count) - offset; + int copy = skb_headlen(skb); + int size, offset1 = 0; + int i = 0; + + /* Skip over from offset */ + while (count && (offset >= from->iov_len)) { + offset -= from->iov_len; + ++from; + --count; + } + + /* copy up to skb headlen */ + while (count && (copy > 0)) { + size = min_t(unsigned int, copy, from->iov_len - offset); + if (copy_from_user(skb->data + offset1, from->iov_base + offset, + size)) + return -EFAULT; + if (copy > size) { + ++from; + --count; + offset = 0; + } else + offset += size; + copy -= size; + offset1 += size; + } + + if (len == offset1) + return 0; + + while (count--) { + struct page *page[MAX_SKB_FRAGS]; + int num_pages; + unsigned long base; + unsigned long truesize; + + len = from->iov_len - offset; + if (!len) { + offset = 0; + ++from; + continue; + } + base = (unsigned long)from->iov_base + offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; + num_pages = get_user_pages_fast(base, size, 0, &page[i]); + if (num_pages != size) { + int j; + + for (j = 0; j < num_pages; j++) + put_page(page[i + j]); + return -EFAULT; + } + truesize = size * PAGE_SIZE; + skb->data_len += len; + skb->len += len; + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (len) { + int off = base & ~PAGE_MASK; + int size = min_t(int, len, PAGE_SIZE - off); + __skb_fill_page_desc(skb, i, page[i], off, size); + skb_shinfo(skb)->nr_frags++; + /* increase sk_wmem_alloc */ + base += size; + len -= size; + i++; + } + offset = 0; + ++from; + } + return 0; +} +EXPORT_SYMBOL(zerocopy_sg_from_iovec); + static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 __user *to, int len, __wsum *csump) -- cgit v1.2.3-59-g8ed1b From 28d6427109d13b0f447cba5761f88d3548e83605 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2013 14:38:47 -0700 Subject: net: attempt high order allocations in sock_alloc_send_pskb() Adding paged frags skbs to af_unix sockets introduced a performance regression on large sends because of additional page allocations, even if each skb could carry at least 100% more payload than before. We can instruct sock_alloc_send_pskb() to attempt high order allocations. Most of the time, it does a single page allocation instead of 8. I added an additional parameter to sock_alloc_send_pskb() to let other users to opt-in for this new feature on followup patches. Tested: Before patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 46861.15 After patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 57981.11 Signed-off-by: Eric Dumazet Cc: David Rientjes Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 2 +- drivers/net/tun.c | 2 +- include/net/sock.h | 3 +- net/core/sock.c | 100 +++++++++++++++++++++++++------------------------ net/packet/af_packet.c | 2 +- net/unix/af_unix.c | 6 ++- 6 files changed, 60 insertions(+), 55 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 182364abfa35..8f6056d9ba97 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -524,7 +524,7 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad, linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - err); + err, 0); if (!skb) return NULL; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b1630479d4a1..978d8654b14a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -949,7 +949,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - &err); + &err, 0); if (!skb) return ERR_PTR(err); diff --git a/include/net/sock.h b/include/net/sock.h index ab6a8b75207e..e4bbcbfd07ea 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1539,7 +1539,8 @@ extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode); + int *errcode, + int max_page_order); extern void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); extern void sock_kfree_s(struct sock *sk, void *mem, int size); diff --git a/net/core/sock.c b/net/core/sock.c index 83667de45ec9..5b6beba494a3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1741,24 +1741,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; err = -EMSGSIZE; if (npages > MAX_SKB_FRAGS) goto failure; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; - timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -1767,50 +1766,52 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... */ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | __GFP_NOWARN, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1819,6 +1820,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } @@ -1827,7 +1829,7 @@ EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 4cb28a7f639b..6c53dd9f5ccc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2181,7 +2181,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - err); + err, 0); if (!skb) return NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 99dc760cdd95..fee9e3397cd1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1479,7 +1479,8 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, MAX_SKB_FRAGS * PAGE_SIZE); skb = sock_alloc_send_pskb(sk, len - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err, + PAGE_ALLOC_COSTLY_ORDER); if (skb == NULL) goto out; @@ -1651,7 +1652,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err, + get_order(UNIX_SKB_FRAGS_SZ)); if (!skb) goto out_err; -- cgit v1.2.3-59-g8ed1b From fb7589a162162223e6bb6422dde3fb1ce07d9a78 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:31:38 +0400 Subject: tun: Add ability to create tun device with given index Tun devices cannot be created with ifidex user wants, but it's required by checkpoint-restore project. Long time ago such ability was implemented for rtnl_ops-based interface for creating links (9c7dafbf net: Allow to create links with given ifindex), but the only API for creating and managing tuntap devices is ioctl-based and is evolving with adding new ones (cde8b15f tuntap: add ioctl to attach or detach a file form tuntap device). Following that trend, here's how a new ioctl that sets the ifindex for device, that _will_ be created by TUNSETIFF ioctl looks like. So those who want a tuntap device with the ifindex N, should open the tun device, call ioctl(fd, TUNSETIFINDEX, &N), then call TUNSETIFF. If the index N is busy, then the register_netdev will find this out and the ioctl would be failed with -EBUSY. If setifindex is not called, then it will be generated as before. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 21 ++++++++++++++++++++- include/uapi/linux/if_tun.h | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7ed13cc0dcb2..4b65fbcc490f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -138,7 +138,10 @@ struct tun_file { struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; - u16 queue_index; + union { + u16 queue_index; + unsigned int ifindex; + }; struct list_head next; struct tun_struct *detached; }; @@ -1601,6 +1604,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; + dev->ifindex = tfile->ifindex; tun = netdev_priv(dev); tun->dev = dev; @@ -1817,6 +1821,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, kgid_t group; int sndbuf; int vnet_hdr_sz; + unsigned int ifindex; int ret; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) { @@ -1851,6 +1856,19 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; goto unlock; } + if (cmd == TUNSETIFINDEX) { + ret = -EPERM; + if (tun) + goto unlock; + + ret = -EFAULT; + if (copy_from_user(&ifindex, argp, sizeof(ifindex))) + goto unlock; + + ret = 0; + tfile->ifindex = ifindex; + goto unlock; + } ret = -EBADFD; if (!tun) @@ -2099,6 +2117,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) rcu_assign_pointer(tfile->tun, NULL); tfile->net = get_net(current->nsproxy->net_ns); tfile->flags = 0; + tfile->ifindex = 0; rcu_assign_pointer(tfile->socket.wq, &tfile->wq); init_waitqueue_head(&tfile->wq.wait); diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 1870ee29bb37..c58d023c4822 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -56,6 +56,7 @@ #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETIFINDEX _IOW('T', 218, unsigned int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-59-g8ed1b From 3d407a80b62fc5891b41fe9045f23aba4437fc33 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:32:00 +0400 Subject: tun: Report whether the queue is attached or not Multiqueue tun devices allow to attach and detach from its queues while keeping the interface itself set on file. Knowing this is critical for the checkpoint part of criu project. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 4b65fbcc490f..db43a2409733 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1881,6 +1881,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNGETIFF: tun_get_iff(current->nsproxy->net_ns, tun, &ifr); + if (tfile->detached) + ifr.ifr_flags |= IFF_DETACH_QUEUE; + if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; -- cgit v1.2.3-59-g8ed1b From 849c9b6f93cc4cb5eb59301b6380a7a81b43f414 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:32:21 +0400 Subject: tun: Allow to skip filter on attach There's a small problem with sk-filters on tun devices. Consider an application doing this sequence of steps: fd = open("/dev/net/tun"); ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" }); ioctl(fd, TUNATTACHFILTER, &my_filter); ioctl(fd, TUNSETPERSIST, 1); close(fd); At that point the tun0 will remain in the system and will keep in mind that there should be a socket filter at address '&my_filter'. If after that we do fd = open("/dev/net/tun"); ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" }); we most likely receive the -EFAULT error, since tun_attach() would try to connect the filter back. But (!) if we provide a filter at address &my_filter, then tun0 will be created and the "new" filter would be attached, but application may not know about that. This may create certain problems to anyone using tun-s, but it's critical problem for c/r -- if we meet a persistent tun device with a filter in mind, we will not be able to attach to it to dump its state (flags, owner, address, vnethdr size, etc.). The proposal is to allow to attach to tun device (with TUNSETIFF) w/o attaching the filter to the tun-file's socket. After this attach app may e.g clean the device by dropping the filter, it doesn't want to have one, or (in case of c/r) get information about the device with tun ioctls. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 12 +++++++----- include/uapi/linux/if_tun.h | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index db43a2409733..6acbdbccebcd 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -501,7 +501,7 @@ static void tun_detach_all(struct net_device *dev) module_put(THIS_MODULE); } -static int tun_attach(struct tun_struct *tun, struct file *file) +static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) { struct tun_file *tfile = file->private_data; int err; @@ -526,7 +526,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file) err = 0; /* Re-attach the filter to presist device */ - if (tun->filter_attached == true) { + if (!skip_filter && (tun->filter_attached == true)) { err = sk_attach_filter(&tun->fprog, tfile->socket.sk); if (!err) goto out; @@ -1557,7 +1557,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - err = tun_attach(tun, file); + err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); if (err < 0) return err; @@ -1631,7 +1631,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev->vlan_features = dev->features; INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file); + err = tun_attach(tun, file, false); if (err < 0) goto err_free_dev; @@ -1795,7 +1795,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file); + ret = tun_attach(tun, file, false); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached) @@ -1883,6 +1883,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; + if (!tfile->socket.sk->sk_filter) + ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index c58d023c4822..cc127b2b4c3d 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -71,6 +71,7 @@ #define IFF_DETACH_QUEUE 0x0400 /* read-only flag */ #define IFF_PERSIST 0x0800 +#define IFF_NOFILTER 0x1000 /* Socket options */ #define TUN_TX_TIMESTAMP 1 -- cgit v1.2.3-59-g8ed1b From 76975e9cb4a7c6fe39478a3dc4dd292a5c6c8c74 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:32:39 +0400 Subject: tun: Get skfilter layout The only thing we may have from tun device is the fprog, whic contains the number of filter elements and a pointer to (user-space) memory where the elements are. The program itself may not be available if the device is persistent and detached. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 10 ++++++++++ include/uapi/linux/if_tun.h | 1 + 2 files changed, 11 insertions(+) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 6acbdbccebcd..60a1e93e9d35 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2042,6 +2042,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_detach_filter(tun, tun->numqueues); break; + case TUNGETFILTER: + ret = -EINVAL; + if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + break; + ret = -EFAULT; + if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) + break; + ret = 0; + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index cc127b2b4c3d..e9502dd1ee2c 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -57,6 +57,7 @@ #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) #define TUNSETIFINDEX _IOW('T', 218, unsigned int) +#define TUNGETFILTER _IOR('T', 219, struct sock_fprog) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-59-g8ed1b From 4bfb0513ff203a700f5d17b97b772e8c171549bc Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 5 Sep 2013 17:53:59 +0800 Subject: tuntap: purge socket error queue on detach Commit eda297729171fe16bf34fe5b0419dfb69060f623 (tun: Support software transmit time stamping) will queue skbs into error queue when tx stamping is enabled. But it forgets to purge the error queue during detach. This patch fixes this. Cc: Richard Cochran Acked-by: Richard Cochran Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 60a1e93e9d35..2dddb1bc82c9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -409,6 +409,12 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) return tun; } +static void tun_queue_purge(struct tun_file *tfile) +{ + skb_queue_purge(&tfile->sk.sk_receive_queue); + skb_queue_purge(&tfile->sk.sk_error_queue); +} + static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; @@ -435,7 +441,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean) synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ - skb_queue_purge(&tfile->sk.sk_receive_queue); + tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); @@ -487,12 +493,12 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); /* Drop read queue */ - skb_queue_purge(&tfile->sk.sk_receive_queue); + tun_queue_purge(tfile); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); - skb_queue_purge(&tfile->sk.sk_receive_queue); + tun_queue_purge(tfile); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); -- cgit v1.2.3-59-g8ed1b From 7bf6630523a4fddcc3e37bc37dadbe0cf2362354 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 5 Sep 2013 17:54:00 +0800 Subject: tuntap: orphan frags before trying to set tx timestamp sock_tx_timestamp() will clear all zerocopy flags of skb which may lead the frags never to be orphaned. This will break guest to guest traffic when zerocopy is enabled. Fix this by orphaning the frags before trying to set tx time stamp. The issue were introduced by commit eda297729171fe16bf34fe5b0419dfb69060f623 (tun: Support software transmit time stamping). Cc: Richard Cochran Cc: Sergei Shtylyov Acked-by: Richard Cochran Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2dddb1bc82c9..a639de8401f8 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -749,15 +749,17 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) >= dev->tx_queue_len / tun->numqueues) goto drop; + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + goto drop; + if (skb->sk) { sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); sw_tx_timestamp(skb); } /* Orphan the skb - required as we might hang on to it - * for indefinite time. */ - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) - goto drop; + * for indefinite time. + */ skb_orphan(skb); nf_reset(skb); -- cgit v1.2.3-59-g8ed1b From 662ca437e714caaab855b12415d6ffd815985bc0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 11 Sep 2013 18:09:48 +0800 Subject: tuntap: correctly handle error in tun_set_iff() Commit c8d68e6be1c3b242f1c598595830890b65cea64a (tuntap: multiqueue support) only call free_netdev() on error in tun_set_iff(). This causes several issues: - memory of tun security were leaked - use after free since the flow gc timer was not deleted and the tfile were not detached This patch solves the above issues. Reported-by: Wannes Rombouts Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/tun.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a639de8401f8..807815fc9968 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1641,11 +1641,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, file, false); if (err < 0) - goto err_free_dev; + goto err_free_flow; err = register_netdevice(tun->dev); if (err < 0) - goto err_free_dev; + goto err_detach; if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) || device_create_file(&tun->dev->dev, &dev_attr_owner) || @@ -1689,7 +1689,12 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) strcpy(ifr->ifr_name, tun->dev->name); return 0; - err_free_dev: +err_detach: + tun_detach_all(dev); +err_free_flow: + tun_flow_uninit(tun); + security_tun_dev_free_security(tun->security); +err_free_dev: free_netdev(dev); return err; } -- cgit v1.2.3-59-g8ed1b