From b50b0580d27bc45a0637aefc8bac4d31aa85771a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:48:57 +0100 Subject: net: add queue argument to __skb_wait_for_more_packets and __skb_{,try_}recv_datagram This will be used by ESP over TCP to handle the queue of IKE messages. Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/unix/af_unix.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7cfdce10de36..a7f707fc4cac 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2058,8 +2058,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, - &last); + skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, + NULL, &skip, &err, &last); if (skb) break; @@ -2068,7 +2068,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (err != -EAGAIN) break; } while (timeo && - !__skb_wait_for_more_packets(sk, &err, &timeo, last)); + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); -- cgit v1.2.3-59-g8ed1b From 3c32da19a858fb1ae8a76bf899160be49f338506 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 9 Dec 2019 13:03:46 +0300 Subject: unix: Show number of pending scm files of receive queue in fdinfo Unix sockets are like a black box. You never know what is stored there: there may be a file descriptor holding a mount or a block device, or there may be whole universes with namespaces, sockets with receive queues full of sockets etc. The patch adds a little debug and accounts number of files (not recursive), which is in receive queue of a unix socket. 
Sometimes this is useful to determine that a socket should be investigated or which task should be killed to put a reference counter on a resource. v2: Pass correct argument to lockdep Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/af_unix.h | 5 +++++ net/unix/af_unix.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 3426d6dacc45..17e10fba2152 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -41,6 +41,10 @@ struct unix_skb_parms { u32 consumed; } __randomize_layout; +struct scm_stat { + u32 nr_fds; +}; + #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) #define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) @@ -65,6 +69,7 @@ struct unix_sock { #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; + struct scm_stat scm_stat; }; static inline struct unix_sock *unix_sk(const struct sock *sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7cfdce10de36..050eed3b30dc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -676,6 +676,16 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } +static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) +{ + struct sock *sk = sock->sk; + struct unix_sock *u; + + if (sk) { + u = unix_sk(sock->sk); + seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds)); + } +} static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, @@ -701,6 +711,7 @@ static const struct proto_ops unix_stream_ops = { .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { @@ -726,6 +737,7 @@ static const struct proto_ops unix_dgram_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = 
unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { @@ -751,6 +763,7 @@ static const struct proto_ops unix_seqpacket_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static struct proto unix_proto = { @@ -788,6 +801,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); + memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -1572,6 +1586,28 @@ static bool unix_skb_scm_eq(struct sk_buff *skb, unix_secdata_eq(scm, skb); } +static void scm_stat_add(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds += fp->count; +} + +static void scm_stat_del(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds -= fp->count; +} + /* * Send AF_UNIX data. 
*/ @@ -1757,7 +1793,10 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); @@ -1859,7 +1898,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto pipe_err_free; maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2058,8 +2100,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, - &last); + skb = __skb_try_recv_datagram(sk, flags, scm_stat_del, + &skip, &err, &last); if (skb) break; @@ -2353,8 +2395,12 @@ unlock: sk_peek_offset_bwd(sk, chunk); - if (UNIXCB(skb).fp) + if (UNIXCB(skb).fp) { + spin_lock(&sk->sk_receive_queue.lock); + scm_stat_del(sk, skb); + spin_unlock(&sk->sk_receive_queue.lock); unix_detach_fds(&scm, skb); + } if (unix_skb_len(skb)) break; -- cgit v1.2.3-59-g8ed1b From 86b18aaa2b5b5bb48e609cd591b3d2d0fdbe0442 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 4 Feb 2020 13:40:29 -0500 Subject: skbuff: fix a data race in skb_queue_len() sk_buff.qlen can be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in __skb_try_recv_from_queue / unix_dgram_sendmsg read to 0xffff8a1b1d8a81c0 of 4 bytes by task 5371 on cpu 96: unix_dgram_sendmsg+0x9a9/0xb70 include/linux/skbuff.h:1821 net/unix/af_unix.c:1761 ____sys_sendmsg+0x33e/0x370 ___sys_sendmsg+0xa6/0xf0 
__sys_sendmsg+0x69/0xf0 __x64_sys_sendmsg+0x51/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe write to 0xffff8a1b1d8a81c0 of 4 bytes by task 1 on cpu 99: __skb_try_recv_from_queue+0x327/0x410 include/linux/skbuff.h:2029 __skb_try_recv_datagram+0xbe/0x220 unix_dgram_recvmsg+0xee/0x850 ____sys_recvmsg+0x1fb/0x210 ___sys_recvmsg+0xa2/0xf0 __sys_recvmsg+0x66/0xf0 __x64_sys_recvmsg+0x51/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe Since only the read is operating as lockless, it could introduce a logic bug in unix_recvq_full() due to the load tearing. Fix it by adding a lockless variant of skb_queue_len() and unix_recvq_full() where READ_ONCE() is on the read while WRITE_ONCE() is on the write similar to the commit d7d16a89350a ("net: add skb_queue_empty_lockless()"). Signed-off-by: Qian Cai Signed-off-by: David S. Miller --- include/linux/skbuff.h | 14 +++++++++++++- net/unix/af_unix.c | 11 +++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3d13a4b717e9..ca8806b69388 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1821,6 +1821,18 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) return list_->qlen; } +/** + * skb_queue_len_lockless - get queue length + * @list_: list to measure + * + * Return the length of an &sk_buff queue. + * This variant can be used in lockless contexts. 
+ */ +static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) +{ + return READ_ONCE(list_->qlen); +} + /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize @@ -2026,7 +2038,7 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; - list->qlen--; + WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 321af97c7bbe..62c12cb5763e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -189,11 +189,17 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk) return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } -static inline int unix_recvq_full(struct sock const *sk) +static inline int unix_recvq_full(const struct sock *sk) { return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } +static inline int unix_recvq_full_lockless(const struct sock *sk) +{ + return skb_queue_len_lockless(&sk->sk_receive_queue) > + READ_ONCE(sk->sk_max_ack_backlog); +} + struct sock *unix_peer_get(struct sock *s) { struct sock *peer; @@ -1758,7 +1764,8 @@ restart_locked: * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && - unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + unlikely(unix_peer(other) != sk && + unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); -- cgit v1.2.3-59-g8ed1b