From d1361840f8c519eaee9a78ffe09e4f0a1b586846 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:35 -0700 Subject: tcp: fix SO_RCVLOWAT and RCVBUF autotuning Applications might use SO_RCVLOWAT on TCP socket hoping to receive one [E]POLLIN event only when a given amount of bytes are ready in socket receive queue. Problem is that receive autotuning is not aware of this constraint, meaning sk_rcvbuf might be too small to allow all bytes to be stored. Add a new (struct proto_ops)->set_rcvlowat method so that a protocol can override the default setsockopt(SO_RCVLOWAT) behavior. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaed0367e669..f5c562aaef35 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1006,6 +1006,7 @@ const struct proto_ops inet_stream_ops = { .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); -- cgit v1.2.3-59-g8ed1b From 93ab6cc69162775201587cc9da00d5016dc890e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:38 -0700 Subject: tcp: implement mmap() for zero copy receive Some networks can make sure TCP payload can exactly fit 4KB pages, with well chosen MSS/MTU and architectures. Implement mmap() system call so that applications can avoid copying data without complex splice() games. Note that a successful mmap( X bytes) on TCP socket is consuming bytes, as if recvmsg() has been done. (tp->copied += X) Only PROT_READ mappings are accepted, as skb page frags are fundamentally shared and read only. If tcp_mmap() finds data that is not a full page, or a patch of urgent data, -EINVAL is returned, no bytes are consumed. Application must fallback to recvmsg() to read the problematic sequence. mmap() wont block, regardless of socket being in blocking or non-blocking mode. If not enough bytes are in receive queue, mmap() would return -EAGAIN, or -EIO if socket is in a state where no other bytes can be added into receive queue. An application might use SO_RCVLOWAT, poll() and/or ioctl( FIONREAD) to efficiently use mmap() On the sender side, MSG_EOR might help to clearly separate unaligned headers and 4K-aligned chunks if necessary. Tested: mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch. MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header) Without mmap() (tcp_mmap -s) received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit, cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit, cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit, cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit, cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches With mmap() on receiver (tcp_mmap -s -z) received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit, cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit, cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit, cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit, cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 + net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/af_inet6.c | 2 +- 4 files changed, 117 insertions(+), 2 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ee85c47c185..833154e3df17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f5c562aaef35..3ebf599cebae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = { .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c768d306b657..438fbca96cd3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +/* When user wants to mmap X pages, we first need to perform the mapping + * before freeing any skbs in receive queue, otherwise user would be unable + * to fallback to standard recvmsg(). This happens if some data in the + * requested block is not exactly fitting in a page. + * + * We only support order-0 pages for the moment. + * mmap() on TCP is very strict, there is no point + * trying to accommodate with pathological layouts. + */ +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + unsigned int nr_pages = size >> PAGE_SHIFT; + struct page **pages_array = NULL; + u32 seq, len, offset, nr = 0; + struct sock *sk = sock->sk; + const skb_frag_t *frags; + struct tcp_sock *tp; + struct sk_buff *skb; + int ret; + + if (vma->vm_pgoff || !nr_pages) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* TODO: Maybe the following is not needed if pages are COW */ + vma->vm_flags &= ~VM_MAYWRITE; + + lock_sock(sk); + + ret = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + sock_rps_record_flow(sk); + + if (tcp_inq(sk) < size) { + ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN; + goto out; + } + tp = tcp_sk(sk); + seq = tp->copied_seq; + /* Abort if urgent data is in the area */ + if (unlikely(tp->urg_data)) { + u32 urg_offset = tp->urg_seq - seq; + + ret = -EINVAL; + if (urg_offset < size) + goto out; + } + ret = -ENOMEM; + pages_array = kvmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!pages_array) + goto out; + skb = tcp_recv_skb(sk, seq, &offset); + ret = -EINVAL; +skb_start: + /* We do not support anything not in page frags */ + offset -= skb_headlen(skb); + if ((int)offset < 0) + goto out; + if (skb_has_frag_list(skb)) + goto out; + len = skb->data_len - offset; + frags = skb_shinfo(skb)->frags; + while (offset) { + if (frags->size > offset) + goto out; + offset -= frags->size; + frags++; + } + while (nr < nr_pages) { + if (len) { + if (len < PAGE_SIZE) + goto out; + if (frags->size != PAGE_SIZE || frags->page_offset) + goto out; + pages_array[nr++] = skb_frag_page(frags); + frags++; + len -= PAGE_SIZE; + seq += PAGE_SIZE; + continue; + } + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + goto skb_start; + } + /* OK, we have a full set of pages ready to be inserted into vma */ + for (nr = 0; nr < nr_pages; nr++) { + ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT), + pages_array[nr]); + if (ret) + goto out; + } + /* operation is complete, we can 'consume' all skbs */ + tp->copied_seq = seq; + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, size); + + ret = 0; +out: + release_sock(sk); + kvfree(pages_array); + return ret; +} +EXPORT_SYMBOL(tcp_mmap); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e70d59fb26e1..2c694912df2e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = inet_recvmsg, /* ok */ - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, -- cgit v1.2.3-59-g8ed1b From 05255b823a6173525587f29c4e8f1ca33fd7677d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Apr 2018 08:58:08 -0700 Subject: tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive When adding tcp mmap() implementation, I forgot that socket lock had to be taken before current->mm->mmap_sem. syzbot eventually caught the bug. Since we can not lock the socket in tcp mmap() handler we have to split the operation in two phases. 1) mmap() on a tcp socket simply reserves VMA space, and nothing else. This operation does not involve any TCP locking. 2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements the transfert of pages from skbs to one VMA. This operation only uses down_read(¤t->mm->mmap_sem) after holding TCP lock, thus solving the lockdep issue. This new implementation was suggested by Andy Lutomirski with great details. Benefits are : - Better scalability, in case multiple threads reuse VMAS (without mmap()/munmap() calls) since mmap_sem wont be write locked. - Better error recovery. The previous mmap() model had to provide the expected size of the mapping. If for some reason one part could not be mapped (partial MSS), the whole operation had to be aborted. With the tcp_zerocopy_receive struct, kernel can report how many bytes were successfuly mapped, and how many bytes should be read to skip the problematic sequence. - No more memory allocation to hold an array of page pointers. 16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/ - skbs are freed while mmap_sem has been released Following patch makes the change in tcp_mmap tool to demonstrate one possible use of mmap() and setsockopt(... TCP_ZEROCOPY_RECEIVE ...) Note that memcg might require additional changes. Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive") Signed-off-by: Eric Dumazet Reported-by: syzbot Suggested-by: Andy Lutomirski Cc: linux-mm@kvack.org Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 8 ++ net/ipv4/af_inet.c | 2 + net/ipv4/tcp.c | 194 +++++++++++++++++++++++++---------------------- net/ipv6/af_inet6.c | 2 + 4 files changed, 116 insertions(+), 90 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 379b08700a54..e9e8373b34b9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -122,6 +122,7 @@ enum { #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ #define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ +#define TCP_ZEROCOPY_RECEIVE 35 struct tcp_repair_opt { __u32 opt_code; @@ -276,4 +277,11 @@ struct tcp_diag_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; }; +/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ + +struct tcp_zerocopy_receive { + __u64 address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ +}; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3ebf599cebae..b403499fdabe 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = { .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, +#ifdef CONFIG_MMU .mmap = tcp_mmap, +#endif .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dfd090ea54ad..4028ddd14dd5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1726,118 +1726,113 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); -/* When user wants to mmap X pages, we first need to perform the mapping - * before freeing any skbs in receive queue, otherwise user would be unable - * to fallback to standard recvmsg(). This happens if some data in the - * requested block is not exactly fitting in a page. - * - * We only support order-0 pages for the moment. - * mmap() on TCP is very strict, there is no point - * trying to accommodate with pathological layouts. - */ +#ifdef CONFIG_MMU +static const struct vm_operations_struct tcp_vm_ops = { +}; + int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { - unsigned long size = vma->vm_end - vma->vm_start; - unsigned int nr_pages = size >> PAGE_SHIFT; - struct page **pages_array = NULL; - u32 seq, len, offset, nr = 0; - struct sock *sk = sock->sk; - const skb_frag_t *frags; + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + + /* Instruct vm_insert_page() to not down_read(mmap_sem) */ + vma->vm_flags |= VM_MIXEDMAP; + + vma->vm_ops = &tcp_vm_ops; + return 0; +} +EXPORT_SYMBOL(tcp_mmap); + +static int tcp_zerocopy_receive(struct sock *sk, + struct tcp_zerocopy_receive *zc) +{ + unsigned long address = (unsigned long)zc->address; + const skb_frag_t *frags = NULL; + u32 length = 0, seq, offset; + struct vm_area_struct *vma; + struct sk_buff *skb = NULL; struct tcp_sock *tp; - struct sk_buff *skb; int ret; - if (vma->vm_pgoff || !nr_pages) + if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; - if (vma->vm_flags & VM_WRITE) - return -EPERM; - /* TODO: Maybe the following is not needed if pages are COW */ - vma->vm_flags &= ~VM_MAYWRITE; - - lock_sock(sk); - - ret = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) - goto out; + return -ENOTCONN; sock_rps_record_flow(sk); - if (tcp_inq(sk) < size) { - ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN; + down_read(¤t->mm->mmap_sem); + + ret = -EINVAL; + vma = find_vma(current->mm, address); + if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) goto out; - } + zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); + tp = tcp_sk(sk); seq = tp->copied_seq; - /* Abort if urgent data is in the area */ - if (unlikely(tp->urg_data)) { - u32 urg_offset = tp->urg_seq - seq; + zc->length = min_t(u32, zc->length, tcp_inq(sk)); + zc->length &= ~(PAGE_SIZE - 1); - ret = -EINVAL; - if (urg_offset < size) - goto out; - } - ret = -ENOMEM; - pages_array = kvmalloc_array(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!pages_array) - goto out; - skb = tcp_recv_skb(sk, seq, &offset); - ret = -EINVAL; -skb_start: - /* We do not support anything not in page frags */ - offset -= skb_headlen(skb); - if ((int)offset < 0) - goto out; - if (skb_has_frag_list(skb)) - goto out; - len = skb->data_len - offset; - frags = skb_shinfo(skb)->frags; - while (offset) { - if (frags->size > offset) - goto out; - offset -= frags->size; - frags++; - } - while (nr < nr_pages) { - if (len) { - if (len < PAGE_SIZE) - goto out; - if (frags->size != PAGE_SIZE || frags->page_offset) - goto out; - pages_array[nr++] = skb_frag_page(frags); - frags++; - len -= PAGE_SIZE; - seq += PAGE_SIZE; - continue; + zap_page_range(vma, address, zc->length); + + zc->recv_skip_hint = 0; + ret = 0; + while (length + PAGE_SIZE <= zc->length) { + if (zc->recv_skip_hint < PAGE_SIZE) { + if (skb) { + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + } else { + skb = tcp_recv_skb(sk, seq, &offset); + } + + zc->recv_skip_hint = skb->len - offset; + offset -= skb_headlen(skb); + if ((int)offset < 0 || skb_has_frag_list(skb)) + break; + frags = skb_shinfo(skb)->frags; + while (offset) { + if (frags->size > offset) + goto out; + offset -= frags->size; + frags++; + } } - skb = skb->next; - offset = seq - TCP_SKB_CB(skb)->seq; - goto skb_start; - } - /* OK, we have a full set of pages ready to be inserted into vma */ - for (nr = 0; nr < nr_pages; nr++) { - ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT), - pages_array[nr]); + if (frags->size != PAGE_SIZE || frags->page_offset) + break; + ret = vm_insert_page(vma, address + length, + skb_frag_page(frags)); if (ret) - goto out; + break; + length += PAGE_SIZE; + seq += PAGE_SIZE; + zc->recv_skip_hint -= PAGE_SIZE; + frags++; } - /* operation is complete, we can 'consume' all skbs */ - tp->copied_seq = seq; - tcp_rcv_space_adjust(sk); - - /* Clean up data we have read: This will do ACK frames. */ - tcp_recv_skb(sk, seq, &offset); - tcp_cleanup_rbuf(sk, size); - - ret = 0; out: - release_sock(sk); - kvfree(pages_array); + up_read(¤t->mm->mmap_sem); + if (length) { + tp->copied_seq = seq; + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, length); + ret = 0; + if (length == zc->length) + zc->recv_skip_hint = 0; + } else { + if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) + ret = -EIO; + } + zc->length = length; return ret; } -EXPORT_SYMBOL(tcp_mmap); +#endif static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) @@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } return 0; } +#ifdef CONFIG_MMU + case TCP_ZEROCOPY_RECEIVE: { + struct tcp_zerocopy_receive zc; + int err; + + if (get_user(len, optlen)) + return -EFAULT; + if (len != sizeof(zc)) + return -EINVAL; + if (copy_from_user(&zc, optval, len)) + return -EFAULT; + lock_sock(sk); + err = tcp_zerocopy_receive(sk, &zc); + release_sock(sk); + if (!err && copy_to_user(optval, &zc, len)) + err = -EFAULT; + return err; + } +#endif default: return -ENOPROTOOPT; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 36d622c477b1..d0af96e0d109 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = inet_recvmsg, /* ok */ +#ifdef CONFIG_MMU .mmap = tcp_mmap, +#endif .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, -- cgit v1.2.3-59-g8ed1b