diff options
author | David S. Miller <davem@davemloft.net> | 2018-02-27 14:19:11 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-02-27 14:19:11 -0500 |
commit | f4155eff1f27076467826cf9bf77723277ead2ec (patch) | |
tree | 718bf38615de51b487304c660e2c0f4952409885 | |
parent | Merge branch '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue (diff) | |
parent | selftests/net: reap zerocopy completions passed up as ancillary data. (diff) | |
download | linux-dev-f4155eff1f27076467826cf9bf77723277ead2ec.tar.xz linux-dev-f4155eff1f27076467826cf9bf77723277ead2ec.zip |
Merge branch 'RDS-optimized-notification-for-zerocopy-completion'
Sowmini Varadhan says:
====================
RDS: optimized notification for zerocopy completion
Resending with acked-by additions: previous attempt does not show
up in Patchwork. This time with a new mail Message-Id.
RDS applications use predominantly request-response, transacation
based IPC, so that ingress and egress traffic are well-balanced,
and it is possible/desirable to reduce system-call overhead by
piggybacking the notifications for zerocopy completion response
with data.
Moreover, it has been pointed out that socket functions block
if sk_err is non-zero, thus if the RDS code does not plan/need
to use sk_error_queue path for completion notification, it
is preferable to remove the sk_errror_queue related paths in
RDS.
Both of these goals are implemented in this series.
v2: removed sk_error_queue support
v3: incorporated additional code review comments (details in each patch)
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/uapi/linux/errqueue.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/rds.h | 7 | ||||
-rw-r--r-- | net/rds/af_rds.c | 7 | ||||
-rw-r--r-- | net/rds/message.c | 38 | ||||
-rw-r--r-- | net/rds/rds.h | 2 | ||||
-rw-r--r-- | net/rds/recv.c | 31 | ||||
-rw-r--r-- | tools/testing/selftests/net/msg_zerocopy.c | 120 |
7 files changed, 111 insertions, 96 deletions
diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 28812eda4209..dc64cfaf13da 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,13 +20,11 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 -#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 -#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 12e3bca32cad..a66b213de3d7 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -104,6 +104,7 @@ #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 #define RDS_CMSG_ZCOPY_COOKIE 12 +#define RDS_CMSG_ZCOPY_COMPLETION 13 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 @@ -317,6 +318,12 @@ struct rds_rdma_notify { #define RDS_RDMA_DROPPED 3 #define RDS_RDMA_OTHER_ERROR 4 +#define RDS_MAX_ZCOOKIES 8 +struct rds_zcopy_cookies { + __u32 num; + __u32 cookies[RDS_MAX_ZCOOKIES]; +}; + /* * Common set of flags for all RDMA related structs */ diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index a937f18896ae..f7126108a811 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -77,6 +77,7 @@ static int rds_release(struct socket *sock) rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); + __skb_queue_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); @@ -144,7 +145,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion - * update, or a RDMA completion). + * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed. If the application tries @@ -178,7 +179,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || - !list_empty(&rs->rs_notify_queue)) + !list_empty(&rs->rs_notify_queue) || + !skb_queue_empty(&rs->rs_zcookie_queue)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); @@ -513,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); + skb_queue_head_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; diff --git a/net/rds/message.c b/net/rds/message.c index 651834513481..116cf87ccb89 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -58,32 +58,26 @@ EXPORT_SYMBOL_GPL(rds_message_addref); static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) { - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); - int ncookies; - u32 *ptr; + struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb; + int ncookies = ck->num; - if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE) + if (ncookies == RDS_MAX_ZCOOKIES) return false; - ncookies = serr->ee.ee_data; - if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES) - return false; - ptr = skb_put(skb, sizeof(u32)); - *ptr = cookie; - serr->ee.ee_data = ++ncookies; + ck->cookies[ncookies] = cookie; + ck->num = ++ncookies; return true; } static void rds_rm_zerocopy_callback(struct rds_sock *rs, struct rds_znotifier *znotif) { - struct sock *sk = rds_rs_to_sk(rs); struct sk_buff *skb, *tail; - struct sock_exterr_skb *serr; unsigned long flags; struct sk_buff_head *q; u32 cookie = znotif->z_cookie; + struct rds_zcopy_cookies *ck; - q = &sk->sk_error_queue; + q = &rs->rs_zcookie_queue; spin_lock_irqsave(&q->lock, flags); tail = skb_peek_tail(q); @@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_unlock_irqrestore(&q->lock, flags); mm_unaccount_pinned_pages(&znotif->z_mmp); consume_skb(rds_skb_from_znotifier(znotif)); - sk->sk_error_report(sk); + /* caller invokes rds_wake_sk_sleep() */ return; } skb = rds_skb_from_znotifier(znotif); - serr = SKB_EXT_ERR(skb); - memset(&serr->ee, 0, sizeof(serr->ee)); - serr->ee.ee_errno = 0; - serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE; - serr->ee.ee_info = 0; + ck = (struct rds_zcopy_cookies *)skb->cb; + memset(ck, 0, sizeof(*ck)); WARN_ON(!skb_zcookie_add(skb, cookie)); __skb_queue_tail(q, skb); spin_unlock_irqrestore(&q->lock, flags); - sk->sk_error_report(sk); + /* caller invokes rds_wake_sk_sleep() */ mm_unaccount_pinned_pages(&znotif->z_mmp); } @@ -129,6 +120,7 @@ static void rds_message_purge(struct rds_message *rm) if (rm->data.op_mmp_znotifier) { zcopy = true; rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rds_wake_sk_sleep(rs); rm->data.op_mmp_znotifier = NULL; } sock_put(rds_rs_to_sk(rs)); @@ -362,10 +354,12 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, int total_copied = 0; struct sk_buff *skb; - skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), - GFP_KERNEL); + skb = alloc_skb(0, GFP_KERNEL); if (!skb) return -ENOMEM; + BUILD_BUG_ON(sizeof(skb->cb) < + max_t(int, sizeof(struct rds_znotifier), + sizeof(struct rds_zcopy_cookies))); rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, length)) { diff --git a/net/rds/rds.h b/net/rds/rds.h index 31cd38852050..33b16353d8f3 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -603,6 +603,8 @@ struct rds_sock { /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; + + struct sk_buff_head rs_zcookie_queue; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) diff --git a/net/rds/recv.c b/net/rds/recv.c index b080961464df..d50747725221 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -577,6 +577,32 @@ out: return ret; } +static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) +{ + struct sk_buff *skb; + struct sk_buff_head *q = &rs->rs_zcookie_queue; + struct rds_zcopy_cookies *done; + + if (!msg->msg_control) + return false; + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) + return false; + + skb = skb_dequeue(q); + if (!skb) + return false; + done = (struct rds_zcopy_cookies *)skb->cb; + if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), + done)) { + skb_queue_head(q, skb); + return false; + } + consume_skb(skb); + return true; +} + int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { @@ -611,7 +637,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (!rds_next_incoming(rs, &inc)) { if (nonblock) { - ret = -EAGAIN; + bool reaped = rds_recvmsg_zcookie(rs, msg); + + ret = reaped ? 0 : -EAGAIN; break; } @@ -660,6 +688,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ret = -EFAULT; goto out; } + rds_recvmsg_zcookie(rs, msg); rds_stats_inc(s_recv_delivered); diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 5cc2a53bb71c..406cc70c571d 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -344,27 +344,53 @@ static int do_setup_tx(int domain, int type, int protocol) return fd; } -static int do_process_zerocopy_cookies(struct sock_extended_err *serr, - uint32_t *ckbuf, size_t nbytes) +static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck) { - int ncookies, i; + int i; - if (serr->ee_errno != 0) - error(1, 0, "serr: wrong error code: %u", serr->ee_errno); - ncookies = serr->ee_data; - if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES) + if (ck->num > RDS_MAX_ZCOOKIES) error(1, 0, "Returned %d cookies, max expected %d\n", - ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES); - if (nbytes != ncookies * sizeof(uint32_t)) - error(1, 0, "Expected %d cookies, got %ld\n", - ncookies, nbytes/sizeof(uint32_t)); - for (i = 0; i < ncookies; i++) + ck->num, RDS_MAX_ZCOOKIES); + for (i = 0; i < ck->num; i++) if (cfg_verbose >= 2) - fprintf(stderr, "%d\n", ckbuf[i]); - return ncookies; + fprintf(stderr, "%d\n", ck->cookies[i]); + return ck->num; } -static bool do_recv_completion(int fd) +static bool do_recvmsg_completion(int fd) +{ + char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))]; + struct rds_zcopy_cookies *ck; + struct cmsghdr *cmsg; + struct msghdr msg; + bool ret = false; + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + if (recvmsg(fd, &msg, MSG_DONTWAIT)) + return ret; + + if (msg.msg_flags & MSG_CTRUNC) + error(1, errno, "recvmsg notification: truncated"); + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_RDS && + cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) { + + ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg); + completions += do_process_zerocopy_cookies(ck); + ret = true; + break; + } + error(0, 0, "ignoring cmsg at level %d type %d\n", + cmsg->cmsg_level, cmsg->cmsg_type); + } + return ret; +} + +static bool do_recv_completion(int fd, int domain) { struct sock_extended_err *serr; struct msghdr msg = {}; @@ -372,17 +398,13 @@ static bool do_recv_completion(int fd) uint32_t hi, lo, range; int ret, zerocopy; char control[100]; - uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES]; - struct iovec iov; + + if (domain == PF_RDS) + return do_recvmsg_completion(fd); msg.msg_control = control; msg.msg_controllen = sizeof(control); - iov.iov_base = ckbuf; - iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0])); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - ret = recvmsg(fd, &msg, MSG_ERRQUEUE); if (ret == -1 && errno == EAGAIN) return false; @@ -402,10 +424,6 @@ static bool do_recv_completion(int fd) serr = (void *) CMSG_DATA(cm); - if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) { - completions += do_process_zerocopy_cookies(serr, ckbuf, ret); - return true; - } if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) error(1, 0, "serr: wrong origin: %u", serr->ee_origin); if (serr->ee_errno != 0) @@ -440,20 +458,20 @@ static bool do_recv_completion(int fd) } /* Read all outstanding messages on the errqueue */ -static void do_recv_completions(int fd) +static void do_recv_completions(int fd, int domain) { - while (do_recv_completion(fd)) {} + while (do_recv_completion(fd, domain)) {} } /* Wait for all remaining completions on the errqueue */ -static void do_recv_remaining_completions(int fd) +static void do_recv_remaining_completions(int fd, int domain) { int64_t tstop = gettimeofday_ms() + cfg_waittime_ms; while (completions < expected_completions && gettimeofday_ms() < tstop) { - if (do_poll(fd, POLLERR)) - do_recv_completions(fd); + if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR)) + do_recv_completions(fd, domain); } if (completions < expected_completions) @@ -534,13 +552,13 @@ static void do_tx(int domain, int type, int protocol) while (!do_poll(fd, POLLOUT)) { if (cfg_zerocopy) - do_recv_completions(fd); + do_recv_completions(fd, domain); } } while (gettimeofday_ms() < tstop); if (cfg_zerocopy) - do_recv_remaining_completions(fd); + do_recv_remaining_completions(fd, domain); if (close(fd)) error(1, errno, "close"); @@ -631,40 +649,6 @@ static void do_flush_datagram(int fd, int type) bytes += cfg_payload_len; } - -static void do_recvmsg(int fd) -{ - int ret, off = 0; - char *buf; - struct iovec iov; - struct msghdr msg; - struct sockaddr_storage din; - - buf = calloc(cfg_payload_len, sizeof(char)); - iov.iov_base = buf; - iov.iov_len = cfg_payload_len; - - memset(&msg, 0, sizeof(msg)); - msg.msg_name = &din; - msg.msg_namelen = sizeof(din); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - ret = recvmsg(fd, &msg, MSG_TRUNC); - - if (ret == -1) - error(1, errno, "recv"); - if (ret != cfg_payload_len) - error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); - - if (memcmp(buf + off, payload, ret)) - error(1, 0, "recv: data mismatch"); - - free(buf); - packets++; - bytes += cfg_payload_len; -} - static void do_rx(int domain, int type, int protocol) { uint64_t tstop; @@ -676,8 +660,6 @@ static void do_rx(int domain, int type, int protocol) do { if (type == SOCK_STREAM) do_flush_tcp(fd); - else if (domain == PF_RDS) - do_recvmsg(fd); else do_flush_datagram(fd, type); |