From 1fbfdfaa590248c1d86407f578e40e5c65136330 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:11 -0700 Subject: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. We will replace the garbage collection algorithm for AF_UNIX, where we will consider each inflight AF_UNIX socket as a vertex and its file descriptor as an edge in a directed graph. This patch introduces a new struct unix_vertex representing a vertex in the graph and adds its pointer to struct unix_sock. When we send a fd using the SCM_RIGHTS message, we allocate struct scm_fp_list to struct scm_cookie in scm_fp_copy(). Then, we bump each refcount of the inflight fds' struct file and save them in scm_fp_list.fp. After that, unix_attach_fds() inexplicably clones scm_fp_list of scm_cookie and sets it to skb. (We will remove this part after replacing GC.) Here, we add a new function call in unix_attach_fds() to preallocate struct unix_vertex per inflight AF_UNIX fd and link each vertex to skb's scm_fp_list.vertices. When sendmsg() succeeds later, if the socket of the inflight fd is still not inflight yet, we will set the preallocated vertex to struct unix_sock.vertex and link it to a global list unix_unvisited_vertices under spin_lock(&unix_gc_lock). If the socket is already inflight, we free the preallocated vertex. This is to avoid taking the lock unnecessarily when sendmsg() could fail later. In the following patch, we will similarly allocate another struct per edge, which will finally be linked to the inflight socket's unix_vertex.edges. And then, we will count the number of edges as unix_vertex.out_degree. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5b41e2321209..a3b25d311560 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -980,6 +980,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->inflight = 0; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); @@ -1805,6 +1806,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + return 0; } @@ -1815,6 +1819,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; + unix_destroy_fpl(scm->fp); + for (i = scm->fp->count - 1; i >= 0; i--) unix_notinflight(scm->fp->user, scm->fp->fp[i]); } -- cgit v1.2.3-59-g8ed1b From 42f298c06b30bfe0a8cbee5d38644e618699e26e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:13 -0700 Subject: af_unix: Link struct unix_edge when queuing skb. Just before queuing skb with inflight fds, we call scm_stat_add(), which is a good place to set up the preallocated struct unix_vertex and struct unix_edge in UNIXCB(skb).fp. Then, we call unix_add_edges() and construct the directed graph as follows: 1. Set the inflight socket's unix_sock to unix_edge.predecessor. 2. Set the receiver's unix_sock to unix_edge.successor. 3. Set the preallocated vertex to inflight socket's unix_sock.vertex. 4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices. 5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges. Let's say we pass the fd of AF_UNIX socket A to B and the fd of B to C. The graph looks like this: +-------------------------+ | unix_unvisited_vertices | <-------------------------. +-------------------------+ | + | | +--------------+ +--------------+ | +--------------+ | | unix_sock A | <---. .---> | unix_sock B | <-|-. .---> | unix_sock C | | +--------------+ | | +--------------+ | | | +--------------+ | .-+ | vertex | | | .-+ | vertex | | | | | vertex | | | +--------------+ | | | +--------------+ | | | +--------------+ | | | | | | | | | | +--------------+ | | | +--------------+ | | | | '-> | unix_vertex | | | '-> | unix_vertex | | | | | +--------------+ | | +--------------+ | | | `---> | entry | +---------> | entry | +-' | | |--------------| | | |--------------| | | | edges | <-. | | | edges | <-. | | +--------------+ | | | +--------------+ | | | | | | | | | .----------------------' | | .----------------------' | | | | | | | | | +--------------+ | | | +--------------+ | | | | unix_edge | | | | | unix_edge | | | | +--------------+ | | | +--------------+ | | `-> | vertex_entry | | | `-> | vertex_entry | | | |--------------| | | |--------------| | | | predecessor | +---' | | predecessor | +---' | |--------------| | |--------------| | | successor | +-----' | successor | +-----' +--------------+ +--------------+ Henceforth, we denote such a graph as A -> B (-> C). Now, we can express all inflight fd graphs that do not contain embryo sockets. We will support the particular case later. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 2 ++ include/net/scm.h | 1 + net/core/scm.c | 2 ++ net/unix/af_unix.c | 8 +++-- net/unix/garbage.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 100 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 55c4abc26a71..f31ad1166346 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -22,6 +22,8 @@ extern unsigned int unix_tot_inflight; void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); diff --git a/include/net/scm.h b/include/net/scm.h index 5f5154e5096d..bbc5527809d1 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -32,6 +32,7 @@ struct scm_fp_list { short count_unix; short max; #ifdef CONFIG_UNIX + bool inflight; struct list_head vertices; struct unix_edge *edges; #endif diff --git a/net/core/scm.c b/net/core/scm.c index 1bcc8a2d65e3..5763f3320358 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->max = SCM_MAX_FD; fpl->user = NULL; #if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; fpl->edges = NULL; INIT_LIST_HEAD(&fpl->vertices); #endif @@ -384,6 +385,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); #if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; new_fpl->edges = NULL; INIT_LIST_HEAD(&new_fpl->vertices); #endif diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a3b25d311560..24adbc4d5188 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1943,8 +1943,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1952,8 +1954,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* diff --git a/net/unix/garbage.c b/net/unix/garbage.c index f31917683288..36d665936096 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,38 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static LIST_HEAD(unix_unvisited_vertices); + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; + } + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); +} + +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + list_del(&edge->vertex_entry); + vertex->out_degree--; + + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); + } +} + static void unix_free_vertices(struct scm_fp_list *fpl) { struct unix_vertex *vertex, *next_vertex; @@ -111,6 +143,60 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } } +DEFINE_SPINLOCK(unix_gc_lock); + +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) +{ + int i = 0, j = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; + + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + +out: + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); +} + +void unix_del_edges(struct scm_fp_list *fpl) +{ + int i = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_edge *edge = fpl->edges + i++; + + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + +out: + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +} + int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; @@ -141,11 +227,13 @@ err: void unix_destroy_fpl(struct scm_fp_list *fpl) { + if (fpl->inflight) + unix_del_edges(fpl); + kvfree(fpl->edges); unix_free_vertices(fpl); } -DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); static LIST_HEAD(gc_inflight_list); -- cgit v1.2.3-59-g8ed1b From aed6ecef55d70de3762ce41c561b7f547dbaf107 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:17 -0700 Subject: af_unix: Save listener for embryo socket. This is a prep patch for the following change, where we need to fetch the listening socket from the successor embryo socket during GC. We add a new field to struct unix_sock to save a pointer to a listening socket. We set it when connect() creates a new socket, and clear it when accept() is called. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 67736767b616..dc7469191195 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -83,6 +83,7 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; + struct sock *listener; struct unix_vertex *vertex; struct list_head link; unsigned long inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 24adbc4d5188..af74e7ebc35a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -979,6 +979,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); + u->listener = NULL; u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; @@ -1598,6 +1599,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1693,8 +1695,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1719,6 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; + unix_sk(tsk)->listener = NULL; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); -- cgit v1.2.3-59-g8ed1b From dcf70df2048d27c5d186f013f101a4aefd63aa41 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:18 -0700 Subject: af_unix: Fix up unix_edge.successor for embryo socket. To garbage collect inflight AF_UNIX sockets, we must define the cyclic reference appropriately. This is a bit tricky if the loop consists of embryo sockets. Suppose that the fd of AF_UNIX socket A is passed to D and the fd B to C and that C and D are embryo sockets of A and B, respectively. It may appear that there are two separate graphs, A (-> D) and B (-> C), but this is not correct. A --. .-- B X C <-' `-> D Now, D holds A's refcount, and C has B's refcount, so unix_release() will never be called for A and B when we close() them. However, no one can call close() for D and C to free skbs holding refcounts of A and B because C/D is in A/B's receive queue, which should have been purged by unix_release() for A and B. So, here's another type of cyclic reference. When a fd of an AF_UNIX socket is passed to an embryo socket, the reference is indirectly held by its parent listening socket. .-> A .-> B | `- sk_receive_queue | `- sk_receive_queue | `- skb | `- skb | `- sk == C | `- sk == D | `- sk_receive_queue | `- sk_receive_queue | `- skb +---------' `- skb +-. | | `---------------------------------------------------------' Technically, the graph must be denoted as A <-> B instead of A (-> D) and B (-> C) to find such a cyclic reference without touching each socket's receive queue. .-> A --. .-- B <-. | X | == A <-> B `-- C <-' `-> D --' We apply this fixup during GC by fetching the real successor by unix_edge_successor(). When we call accept(), we clear unix_sock.listener under unix_gc_lock not to confuse GC. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 2 +- net/unix/garbage.c | 20 +++++++++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index dc7469191195..414463803b7e 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -24,6 +24,7 @@ void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index af74e7ebc35a..ae77e2dc0dae 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1721,7 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; - unix_sk(tsk)->listener = NULL; + unix_update_edges(unix_sk(tsk)); skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 33aadaa35346..8d0912c1d01a 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,17 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) +{ + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. + */ + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { @@ -209,6 +220,13 @@ out: fpl->inflight = false; } +void unix_update_edges(struct unix_sock *receiver) +{ + spin_lock(&unix_gc_lock); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); +} + int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; @@ -268,7 +286,7 @@ next_vertex: /* Explore neighbour vertices (receivers of the current vertex's fd). */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { - struct unix_vertex *next_vertex = edge->successor->vertex; + struct unix_vertex *next_vertex = unix_edge_successor(edge); if (!next_vertex) continue; -- cgit v1.2.3-59-g8ed1b From 4090fa373f0e763c43610853d2774b5979915959 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:24 -0700 Subject: af_unix: Replace garbage collection algorithm. If we find a dead SCC during iteration, we call unix_collect_skb() to splice all skb in the SCC to the global sk_buff_head, hitlist. After iterating all SCC, we unlock unix_gc_lock and purge the queue. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 8 -- net/unix/af_unix.c | 12 -- net/unix/garbage.c | 302 +++++++++++--------------------------------------- 3 files changed, 64 insertions(+), 258 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 696d997a5ac9..226a8da2cbe3 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) extern spinlock_t unix_gc_lock; extern unsigned int unix_tot_inflight; - -void unix_inflight(struct user_struct *user, struct file *fp); -void unix_notinflight(struct user_struct *user, struct file *fp); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); void unix_update_edges(struct unix_sock *receiver); @@ -85,12 +82,7 @@ struct unix_sock { struct sock *peer; struct sock *listener; struct unix_vertex *vertex; - struct list_head link; - unsigned long inflight; spinlock_t lock; - unsigned long gc_flags; -#define UNIX_GC_CANDIDATE 0 -#define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; struct scm_stat scm_stat; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ae77e2dc0dae..27ca50ab1cd1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -980,12 +980,10 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->listener = NULL; - u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1793,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p) static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - if (too_many_unix_fds(current)) return -ETOOMANYREFS; @@ -1806,9 +1802,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; @@ -1817,15 +1810,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; unix_destroy_fpl(scm->fp); - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 0a6b38da578c..89ea71d9297b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix_vertex *vertex) return true; } +enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_NORMAL, + U_RECVQ_LOCK_EMBRYO, +}; + +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) +{ + struct unix_vertex *vertex; + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + /* listener -> embryo order, the inversion never happens. */ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + skb_queue_splice_init(embryo_queue, hitlist); + spin_unlock(&embryo_queue->lock); + } + } else { + skb_queue_splice_init(queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif + } + + spin_unlock(&queue->lock); + } +} + static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; @@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_head *scc) static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; -static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index) +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) { LIST_HEAD(vertex_stack); struct unix_edge *edge; @@ -430,7 +477,9 @@ prev_vertex: scc_dead = unix_vertex_dead(vertex); } - if (!unix_graph_maybe_cyclic) + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); list_del(&scc); @@ -441,7 +490,7 @@ prev_vertex: goto prev_vertex; } -static void unix_walk_scc(void) +static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; @@ -454,7 +503,7 @@ static void unix_walk_scc(void) struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); - __unix_walk_scc(vertex, &last_index); + __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); @@ -463,7 +512,7 @@ static void unix_walk_scc(void) unix_graph_grouped = true; } -static void unix_walk_scc_fast(void) +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; @@ -480,263 +529,40 @@ static void unix_walk_scc_fast(void) scc_dead = unix_vertex_dead(vertex); } + if (scc_dead) + unix_collect_skb(&scc, hitlist); + list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } -static LIST_HEAD(gc_candidates); -static LIST_HEAD(gc_inflight_list); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *filp) -{ - struct unix_sock *u = unix_get_socket(filp); - - spin_lock(&unix_gc_lock); - - if (u) { - if (!u->inflight) { - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - WARN_ON_ONCE(list_empty(&u->link)); - } - u->inflight++; - } - - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *filp) -{ - struct unix_sock *u = unix_get_socket(filp); - - spin_lock(&unix_gc_lock); - - if (u) { - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - } - - spin_unlock(&unix_gc_lock); -} - -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) -{ - struct sk_buff *skb; - struct sk_buff *next; - - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; - - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct unix_sock *u = unix_get_socket(*fp++); - - /* Ignore non-candidates, they could have been added - * to the queues after starting the garbage collection - */ - if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } - } - spin_unlock(&x->sk_receive_queue.lock); -} - -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) -{ - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); - - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); - - /* An embryo cannot be in-flight, so it's safe - * to use the list link. - */ - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); - - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } - } -} - -static void dec_inflight(struct unix_sock *usk) -{ - usk->inflight--; -} - -static void inc_inflight(struct unix_sock *usk) -{ - usk->inflight++; -} - -static void inc_inflight_move_tail(struct unix_sock *u) -{ - u->inflight++; - - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over - */ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); -} - static bool gc_in_progress; static void __unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; - struct unix_sock *u, *next; - LIST_HEAD(not_cycle_list); - struct list_head cursor; spin_lock(&unix_gc_lock); - if (!unix_graph_maybe_cyclic) + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); goto skip_gc; - - if (unix_graph_grouped) - unix_walk_scc_fast(); - else - unix_walk_scc(); - - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - long total_refs; - - total_refs = file_count(u->sk.sk_socket->file); - - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - } - } - - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); - - /* Restore the references for children of all candidates, - * which have remaining references. Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. - */ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); - - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); - - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); - } } - list_del(&cursor); - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); + __skb_queue_head_init(&hitlist); -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } -#endif - } - - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. - */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); - } + if (unix_graph_grouped) + unix_walk_scc_fast(&hitlist); + else + unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); - /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); - - spin_lock(&unix_gc_lock); - - /* All candidates should have been detached by now. */ - WARN_ON_ONCE(!list_empty(&gc_candidates)); skip_gc: - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); - - spin_unlock(&unix_gc_lock); } static DECLARE_WORK(unix_gc_work, __unix_gc); -- cgit v1.2.3-59-g8ed1b From 1abe267f173eae7ae76cf56232292e9641eb652f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 14:40:32 +0000 Subject: net: add sk_wake_async_rcu() helper While looking at UDP receive performance, I saw sk_wake_async() was no longer inlined. This matters at least on AMD Zen1-4 platforms (see SRSO) This might be because rcu_read_lock() and rcu_read_unlock() are no longer nops in recent kernels ? Add sk_wake_async_rcu() variant, which must be called from contexts already holding rcu lock. As SOCK_FASYNC is deprecated in modern days, use unlikely() to give a hint to the compiler. sk_wake_async_rcu() is properly inlined from __udp_enqueue_schedule_skb() and sock_def_readable(). Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- crypto/af_alg.c | 4 ++-- include/net/sock.h | 6 ++++++ net/atm/common.c | 2 +- net/core/sock.c | 8 ++++---- net/dccp/output.c | 2 +- net/ipv4/udp.c | 2 +- net/iucv/af_iucv.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/socket.c | 2 +- net/smc/smc_rx.c | 4 ++-- net/unix/af_unix.c | 2 +- 11 files changed, 21 insertions(+), 15 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 68cc9290cabe..5bc6d0fa7498 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -847,7 +847,7 @@ void af_alg_wmem_wakeup(struct sock *sk) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); @@ -914,7 +914,7 @@ static void af_alg_data_wakeup(struct sock *sk) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/include/net/sock.h b/include/net/sock.h index f57bfd8a2ad2..2253eefe2848 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2513,6 +2513,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) } } +static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) +{ + if (unlikely(sock_flag(sk, SOCK_FASYNC))) + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); +} + /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at diff --git a/net/atm/common.c b/net/atm/common.c index 2a1ec014e901..9b75699992ff 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index 0963689a5950..5ed411231fc7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3338,7 +3338,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3353,7 +3353,7 @@ void sock_def_readable(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3373,7 +3373,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3398,7 +3398,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } diff --git a/net/dccp/output.c b/net/dccp/output.c index fd2eb148d24d..5c2e24f3c39b 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 143043cd2dcb..11460d751e73 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1544,7 +1544,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); } busylock_release(busy); return 0; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 7c8c3adcac6e..c951bb9cc2e0 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5222bc97d192..f4844683e120 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c67679a41044..e416b6d3d270 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9276,7 +9276,7 @@ void sctp_data_ready(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 9a2f3638d161..f0cbe77a80b4 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); rcu_read_unlock(); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 27ca50ab1cd1..533fb682c954 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } -- cgit v1.2.3-59-g8ed1b From 7c349ed090318b1c88a3e5dff3b24f732296edce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 1 Apr 2024 10:31:24 -0700 Subject: af_unix: Remove scm_fp_dup() in unix_attach_fds(). When we passed fds, we used to bump each file's refcount twice in scm_fp_copy() and scm_fp_dup() before linking the socket to gc_inflight_list. This is because we incremented the inflight count of the socket and linked it to the list in advance before passing skb to the destination socket. Otherwise, the inflight socket could have been garbage-collected in a small race window between linking the socket to the list and queuing skb: CPU 1 : sendmsg(X) w/ A's fd CPU 2 : close(A) ----- ----- /* Here A's refcount is 1, and inflight count is 0 */ bump A's refcount to 2 in scm_fp_copy() bump A's inflight count to 1 link A to gc_inflight_list decrement A's refcount to 1 /* A's refcount == inflight count, thus A could be GC candidate */ start GC mark A as candidate purge A's receive queue queue skb w/ A's fd to X /* A is queued, but all data has been lost */ After commit 4090fa373f0e ("af_unix: Replace garbage collection algorithm."), we increment the inflight count and link the socket to the global list only when queuing the skb. The race no longer exists, so let's not clone the fd nor bump the count in unix_attach_fds(). Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240401173125.92184-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 533fb682c954..78be8b520cef 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1794,13 +1794,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (too_many_unix_fds(current)) return -ETOOMANYREFS; - /* Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; + UNIXCB(skb).fp = scm->fp; + scm->fp = NULL; if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 118f457da9ed58a79e24b73c2ef0aa1987241f0e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 1 Apr 2024 10:31:25 -0700 Subject: af_unix: Remove lock dance in unix_peek_fds(). In the previous GC implementation, the shape of the inflight socket graph was not expected to change while GC was in progress. MSG_PEEK was tricky because it could install inflight fd silently and transform the graph. Let's say we peeked a fd, which was a listening socket, and accept()ed some embryo sockets from it. The garbage collection algorithm would have been confused because the set of sockets visited in scan_inflight() would change within the same GC invocation. That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in unix_peek_fds() with a fat comment. In the new GC implementation, we no longer garbage-collect the socket if it exists in another queue, that is, if it has a bridge to another SCC. Also, accept() will require the lock if it has edges. Thus, we need not do the complicated lock dance. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240401173125.92184-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 - net/unix/af_unix.c | 42 ------------------------------------------ net/unix/garbage.c | 2 +- 3 files changed, 1 insertion(+), 44 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 226a8da2cbe3..7311b77edfc7 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -17,7 +17,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif -extern spinlock_t unix_gc_lock; extern unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 78be8b520cef..61ecfa9c9c6b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1814,48 +1814,6 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); - - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); } static void unix_destruct_scm(struct sk_buff *skb) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 89ea71d9297b..12a4ec27e0d4 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -183,7 +183,7 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } } -DEFINE_SPINLOCK(unix_gc_lock); +static DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) -- cgit v1.2.3-59-g8ed1b From b46f4eaa4f0ec38909fb0072eea3aeddb32f954e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 5 Apr 2024 15:10:57 -0700 Subject: af_unix: Clear stale u->oob_skb. syzkaller started to report deadlock of unix_gc_lock after commit 4090fa373f0e ("af_unix: Replace garbage collection algorithm."), but it just uncovers the bug that has been there since commit 314001f0bf92 ("af_unix: Add OOB support"). The repro basically does the following. from socket import * from array import array c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) c1.sendmsg([b'a'], [(SOL_SOCKET, SCM_RIGHTS, array("i", [c2.fileno()]))], MSG_OOB) c2.recv(1) # blocked as no normal data in recv queue c2.close() # done async and unblock recv() c1.close() # done async and trigger GC A socket sends its file descriptor to itself as OOB data and tries to receive normal data, but finally recv() fails due to async close(). The problem here is wrong handling of OOB skb in manage_oob(). When recvmsg() is called without MSG_OOB, manage_oob() is called to check if the peeked skb is OOB skb. In such a case, manage_oob() pops it out of the receive queue but does not clear unix_sock(sk)->oob_skb. This is wrong in terms of uAPI. Let's say we send "hello" with MSG_OOB, and "world" without MSG_OOB. The 'o' is handled as OOB data. When recv() is called twice without MSG_OOB, the OOB data should be lost. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM, 0) >>> c1.send(b'hello', MSG_OOB) # 'o' is OOB data 5 >>> c1.send(b'world') 5 >>> c2.recv(5) # OOB data is not received b'hell' >>> c2.recv(5) # OOB date is skipped b'world' >>> c2.recv(5, MSG_OOB) # This should return an error b'o' In the same situation, TCP actually returns -EINVAL for the last recv(). Also, if we do not clear unix_sk(sk)->oob_skb, unix_poll() always set EPOLLPRI even though the data has passed through by previous recv(). To avoid these issues, we must clear unix_sk(sk)->oob_skb when dequeuing it from recv queue. The reason why the old GC did not trigger the deadlock is because the old GC relied on the receive queue to detect the loop. When it is triggered, the socket with OOB data is marked as GC candidate because file refcount == inflight count (1). However, after traversing all inflight sockets, the socket still has a positive inflight count (1), thus the socket is excluded from candidates. Then, the old GC lose the chance to garbage-collect the socket. With the old GC, the repro continues to create true garbage that will never be freed nor detected by kmemleak as it's linked to the global inflight list. That's why we couldn't even notice the issue. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: syzbot+7f7f201cc2668a8fd169@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7f7f201cc2668a8fd169 Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240405221057.2406-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5b41e2321209..d032eb5fa6df 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2665,7 +2665,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } } else if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); - consume_skb(skb); + WRITE_ONCE(u->oob_skb, NULL); + if (!WARN_ON_ONCE(skb_unref(skb))) + kfree_skb(skb); skb = skb_peek(&sk->sk_receive_queue); } } -- cgit v1.2.3-59-g8ed1b From 283454c8a123072e5c386a5a2b5fc576aa455b6f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:15 -0700 Subject: af_unix: Call manage_oob() for every skb in unix_stream_read_generic(). When we call recv() for AF_UNIX socket, we first peek one skb and calls manage_oob() to check if the skb is sent with MSG_OOB. However, when we fetch the next (and the following) skb, manage_oob() is not called now, leading a wrong behaviour. Let's say a socket send()s "hello" with MSG_OOB and the peer tries to recv() 5 bytes with MSG_PEEK. Here, we should get only "hell" without 'o', but actually not: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hello' The first skb fills 4 bytes, and the next skb is peeked but not properly checked by manage_oob(). Let's move up the again label to call manage_oob() for evry skb. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hell' Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d032eb5fa6df..f297320438bf 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2741,6 +2741,7 @@ redo: last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; +again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); @@ -2752,7 +2753,6 @@ redo: } } #endif -again: if (skb == NULL) { if (copied >= target) goto unlock; -- cgit v1.2.3-59-g8ed1b From 22dd70eb2c3d754862964377a75abafd3167346b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:16 -0700 Subject: af_unix: Don't peek OOB data without MSG_OOB. Currently, we can read OOB data without MSG_OOB by using MSG_PEEK when OOB data is sitting on the front row, which is apparently wrong. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'a' If manage_oob() is called when no data has been copied, we only check if the socket enables SO_OOBINLINE or MSG_PEEK is not used. Otherwise, the skb is returned as is. However, here we should return NULL if MSG_PEEK is set and no data has been copied. Also, in such a case, we should not jump to the redo label because we will be caught in the loop and hog the CPU until normal data comes in. Then, we need to handle skb == NULL case with the if-clause below the manage_oob() block. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) Traceback (most recent call last): File "", line 1, in BlockingIOError: [Errno 11] Resource temporarily unavailable Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f297320438bf..9a6ad5974dff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2663,7 +2663,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } - } else if (!(flags & MSG_PEEK)) { + } else if (flags & MSG_PEEK) { + skb = NULL; + } else { skb_unlink(skb, &sk->sk_receive_queue); WRITE_ONCE(u->oob_skb, NULL); if (!WARN_ON_ONCE(skb_unref(skb))) @@ -2745,11 +2747,9 @@ again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); - if (!skb) { + if (!skb && copied) { unix_state_unlock(sk); - if (copied) - break; - goto redo; + break; } } #endif -- cgit v1.2.3-59-g8ed1b From fd86344823b521149bb31d91eba900ba3525efa6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 12 Apr 2024 19:19:28 -0700 Subject: af_unix: Try not to hold unix_gc_lock during accept(). Commit dcf70df2048d ("af_unix: Fix up unix_edge.successor for embryo socket.") added spin_lock(&unix_gc_lock) in accept() path, and it caused regression in a stress test as reported by kernel test robot. If the embryo socket is not part of the inflight graph, we need not hold the lock. To decide that in O(1) time and avoid the regression in the normal use case, 1. add a new stat unix_sk(sk)->scm_stat.nr_unix_fds 2. count the number of inflight AF_UNIX sockets in the receive queue under unix_state_lock() 3. move unix_update_edges() call under unix_state_lock() 4. avoid locking if nr_unix_fds is 0 in unix_update_edges() Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404101427.92a08551-oliver.sang@intel.com Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240413021928.20946-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 2 +- net/unix/garbage.c | 20 ++++++++++++++++---- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 7311b77edfc7..872ff2a50372 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -67,6 +67,7 @@ struct unix_skb_parms { struct scm_stat { atomic_t nr_fds; + unsigned long nr_unix_fds; }; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 142d210b5b03..ed16d5f66df8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1719,12 +1719,12 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; - unix_update_edges(unix_sk(tsk)); skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 12a4ec27e0d4..95240a59808f 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -209,6 +209,7 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) unix_add_edge(fpl, edge); } while (i < fpl->count_unix); + receiver->scm_stat.nr_unix_fds += fpl->count_unix; WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); @@ -222,6 +223,7 @@ out: void unix_del_edges(struct scm_fp_list *fpl) { + struct unix_sock *receiver; int i = 0; spin_lock(&unix_gc_lock); @@ -235,6 +237,8 @@ void unix_del_edges(struct scm_fp_list *fpl) unix_del_edge(fpl, edge); } while (i < fpl->count_unix); + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); @@ -246,10 +250,18 @@ out: void unix_update_edges(struct unix_sock *receiver) { - spin_lock(&unix_gc_lock); - unix_update_graph(unix_sk(receiver->listener)->vertex); - receiver->listener = NULL; - spin_unlock(&unix_gc_lock); + /* nr_unix_fds is only updated under unix_state_lock(). + * If it's 0 here, the embryo socket is not part of the + * inflight graph, and GC will not see it, so no lock needed. + */ + if (!receiver->scm_stat.nr_unix_fds) { + receiver->listener = NULL; + } else { + spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); + } } int unix_prepare_fpl(struct scm_fp_list *fpl) -- cgit v1.2.3-59-g8ed1b From 540bf24fba16b88c1b3b9353927204b4f1074e25 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 9 May 2024 01:14:46 -0700 Subject: af_unix: Fix data races in unix_release_sock/unix_stream_sendmsg A data-race condition has been identified in af_unix. In one data path, the write function unix_release_sock() atomically writes to sk->sk_shutdown using WRITE_ONCE. However, on the reader side, unix_stream_sendmsg() does not read it atomically. Consequently, this issue is causing the following KCSAN splat to occur: BUG: KCSAN: data-race in unix_release_sock / unix_stream_sendmsg write (marked) to 0xffff88867256ddbb of 1 bytes by task 7270 on cpu 28: unix_release_sock (net/unix/af_unix.c:640) unix_release (net/unix/af_unix.c:1050) sock_close (net/socket.c:659 net/socket.c:1421) __fput (fs/file_table.c:422) __fput_sync (fs/file_table.c:508) __se_sys_close (fs/open.c:1559 fs/open.c:1541) __x64_sys_close (fs/open.c:1541) x64_sys_call (arch/x86/entry/syscall_64.c:33) do_syscall_64 (arch/x86/entry/common.c:?) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) read to 0xffff88867256ddbb of 1 bytes by task 989 on cpu 14: unix_stream_sendmsg (net/unix/af_unix.c:2273) __sock_sendmsg (net/socket.c:730 net/socket.c:745) ____sys_sendmsg (net/socket.c:2584) __sys_sendmmsg (net/socket.c:2638 net/socket.c:2724) __x64_sys_sendmmsg (net/socket.c:2753 net/socket.c:2750 net/socket.c:2750) x64_sys_call (arch/x86/entry/syscall_64.c:33) do_syscall_64 (arch/x86/entry/common.c:?) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) value changed: 0x01 -> 0x03 The line numbers are related to commit dd5a440a31fa ("Linux 6.9-rc7"). Commit e1d09c2c2f57 ("af_unix: Fix data races around sk->sk_shutdown.") addressed a comparable issue in the past regarding sk->sk_shutdown. However, it overlooked resolving this particular data path. This patch only offending unix_stream_sendmsg() function, since the other reads seem to be protected by unix_state_lock() as discussed in Link: https://lore.kernel.org/all/20240508173324.53565-1-kuniyu@amazon.com/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240509081459.2807828-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9a6ad5974dff..e94839d89b09 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2270,7 +2270,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_err; } - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto pipe_err; while (sent < len) { -- cgit v1.2.3-59-g8ed1b From 92ef0fd55ac80dfc2e4654edfe5d1ddfa6e070fe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 May 2024 09:20:08 -0600 Subject: net: change proto and proto_ops accept type Rather than pass in flags, error pointer, and whether this is a kernel invocation or not, add a struct proto_accept_arg struct as the argument. This then holds all of these arguments, and prepares accept for being able to pass back more information. No functional changes in this patch. Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- crypto/af_alg.c | 11 ++++++----- crypto/algif_hash.c | 10 +++++----- drivers/xen/pvcalls-back.c | 6 +++++- fs/ocfs2/cluster/tcp.c | 5 ++++- include/crypto/if_alg.h | 3 ++- include/linux/net.h | 4 +++- include/net/inet_common.h | 4 ++-- include/net/inet_connection_sock.h | 2 +- include/net/sock.h | 12 +++++++++--- net/atm/svc.c | 8 ++++---- net/ax25/af_ax25.c | 6 +++--- net/bluetooth/iso.c | 4 ++-- net/bluetooth/l2cap_sock.c | 4 ++-- net/bluetooth/rfcomm/sock.c | 6 +++--- net/bluetooth/sco.c | 4 ++-- net/core/sock.c | 4 ++-- net/ipv4/af_inet.c | 10 +++++----- net/ipv4/inet_connection_sock.c | 6 +++--- net/iucv/af_iucv.c | 4 ++-- net/llc/af_llc.c | 7 +++---- net/mptcp/protocol.c | 11 +++++------ net/netrom/af_netrom.c | 6 +++--- net/nfc/llcp_sock.c | 4 ++-- net/phonet/pep.c | 12 ++++++------ net/phonet/socket.c | 7 +++---- net/rds/tcp_listen.c | 6 +++++- net/rose/af_rose.c | 6 +++--- net/sctp/socket.c | 8 ++++---- net/smc/af_smc.c | 6 +++--- net/socket.c | 13 ++++++++++--- net/tipc/socket.c | 13 +++++-------- net/unix/af_unix.c | 21 ++++++++++----------- net/vmw_vsock/af_vsock.c | 6 +++--- net/x25/af_x25.c | 4 ++-- 34 files changed, 132 insertions(+), 111 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 5bc6d0fa7498..18cfead0081d 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -407,7 +407,8 @@ unlock: return err; } -int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern) +int af_alg_accept(struct sock *sk, struct socket *newsock, + struct proto_accept_arg *arg) { struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; @@ -422,7 +423,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern) if (!type) goto unlock; - sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, kern); + sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, arg->kern); err = -ENOMEM; if (!sk2) goto unlock; @@ -468,10 +469,10 @@ unlock: } EXPORT_SYMBOL_GPL(af_alg_accept); -static int alg_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int alg_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { - return af_alg_accept(sock->sk, newsock, kern); + return af_alg_accept(sock->sk, newsock, arg); } static const struct proto_ops alg_proto_ops = { diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index e24c829d7a01..7c7394d46a23 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -223,8 +223,8 @@ unlock: return err ?: len; } -static int hash_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int hash_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); @@ -252,7 +252,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags, if (err) goto out_free_state; - err = af_alg_accept(ask->parent, newsock, kern); + err = af_alg_accept(ask->parent, newsock, arg); if (err) goto out_free_state; @@ -355,7 +355,7 @@ static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg, } static int hash_accept_nokey(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { int err; @@ -363,7 +363,7 @@ static int hash_accept_nokey(struct socket *sock, struct socket *newsock, if (err) return err; - return hash_accept(sock, newsock, flags, kern); + return hash_accept(sock, newsock, arg); } static struct proto_ops algif_hash_ops_nokey = { diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index d52593466a79..fd7ed65e0197 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -517,6 +517,10 @@ static void __pvcalls_back_accept(struct work_struct *work) { struct sockpass_mapping *mappass = container_of( work, struct sockpass_mapping, register_work); + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + .kern = true, + }; struct sock_mapping *map; struct pvcalls_ioworker *iow; struct pvcalls_fedata *fedata; @@ -548,7 +552,7 @@ static void __pvcalls_back_accept(struct work_struct *work) sock->type = mappass->sock->type; sock->ops = mappass->sock->ops; - ret = inet_accept(mappass->sock, sock, O_NONBLOCK, true); + ret = inet_accept(mappass->sock, sock, &arg); if (ret == -EAGAIN) { sock_release(sock); return; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 960080753d3b..2b8fa3e782fb 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1784,6 +1784,9 @@ static int o2net_accept_one(struct socket *sock, int *more) struct o2nm_node *node = NULL; struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + }; struct o2net_node *nn; unsigned int nofs_flag; @@ -1802,7 +1805,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false); + ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 78ecaf5db04c..f7b3b93f3a49 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -166,7 +166,8 @@ int af_alg_unregister_type(const struct af_alg_type *type); int af_alg_release(struct socket *sock); void af_alg_release_parent(struct sock *sk); -int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern); +int af_alg_accept(struct sock *sk, struct socket *newsock, + struct proto_accept_arg *arg); void af_alg_free_sg(struct af_alg_sgl *sgl); diff --git a/include/linux/net.h b/include/linux/net.h index 15df6d5f27a7..688320b79fcc 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -153,6 +153,7 @@ struct sockaddr; struct msghdr; struct module; struct sk_buff; +struct proto_accept_arg; typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, unsigned int, size_t); typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *); @@ -171,7 +172,8 @@ struct proto_ops { int (*socketpair)(struct socket *sock1, struct socket *sock2); int (*accept) (struct socket *sock, - struct socket *newsock, int flags, bool kern); + struct socket *newsock, + struct proto_accept_arg *arg); int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); diff --git a/include/net/inet_common.h b/include/net/inet_common.h index f50a644d87a9..c17a6585d0b0 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -29,8 +29,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); -int inet_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern); +int inet_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 20e7b0c0b3d1..7d6b1254c92d 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -250,7 +250,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, return (unsigned long)min_t(u64, when, max_when); } -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern); +struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg); int inet_csk_get_port(struct sock *sk, unsigned short snum); diff --git a/include/net/sock.h b/include/net/sock.h index 0450494a1766..217079b3e3e8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1194,6 +1194,12 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) size - offsetof(struct sock, sk_node.pprev)); } +struct proto_accept_arg { + int flags; + int err; + bool kern; +}; + /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ @@ -1208,8 +1214,8 @@ struct proto { int addr_len); int (*disconnect)(struct sock *sk, int flags); - struct sock * (*accept)(struct sock *sk, int flags, int *err, - bool kern); + struct sock * (*accept)(struct sock *sk, + struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); @@ -1804,7 +1810,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); -int sock_no_accept(struct socket *, struct socket *, int, bool); +int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); diff --git a/net/atm/svc.c b/net/atm/svc.c index 36a814f1fbd1..f8137ae693b0 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -324,8 +324,8 @@ out: return error; } -static int svc_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int svc_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; @@ -336,7 +336,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, lock_sock(sk); - error = svc_create(sock_net(sk), newsock, 0, kern); + error = svc_create(sock_net(sk), newsock, 0, arg->kern); if (error) goto out; @@ -355,7 +355,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, error = -sk->sk_err; break; } - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { error = -EAGAIN; break; } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 9169efb2f43a..8077cf2ee448 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1373,8 +1373,8 @@ out_release: return err; } -static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int ax25_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; @@ -1409,7 +1409,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, if (skb) break; - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ef0cc80b4c0c..2a075119d65d 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1186,7 +1186,7 @@ done: } static int iso_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; @@ -1195,7 +1195,7 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock, lock_sock(sk); - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 5cc83f906c12..125dddc77452 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -327,7 +327,7 @@ done: } static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; @@ -336,7 +336,7 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, lock_sock_nested(sk, L2CAP_NESTING_PARENT); - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 29aa07e9db9d..37d63d768afb 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -468,8 +468,8 @@ done: return err; } -static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; @@ -483,7 +483,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f goto done; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index e0ad30862ee4..94c6f2b46279 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -647,7 +647,7 @@ done: } static int sco_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; @@ -656,7 +656,7 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock, lock_sock(sk); - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); diff --git a/net/core/sock.c b/net/core/sock.c index 8d6e638b5426..8629f9aecf91 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3241,8 +3241,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) } EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { return -EOPNOTSUPP; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a7bad18bc8b5..de3449e16b89 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -771,16 +771,16 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new * Accept a pending connection. The TCP layer now gives BSD semantics. */ -int inet_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int inet_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk1 = sock->sk, *sk2; - int err = -EINVAL; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ - sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); + arg->err = -EINVAL; + sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg); if (!sk2) - return err; + return arg->err; lock_sock(sk2); __inet_accept(sock, newsock, sk2); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3b38610958ee..7734d189c66b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -661,7 +661,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) /* * This will accept the next outstanding connection. */ -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) +struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; @@ -680,7 +680,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) /* Find already established connection */ if (reqsk_queue_empty(queue)) { - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ error = -EAGAIN; @@ -745,7 +745,7 @@ out: out_err: newsk = NULL; req = NULL; - *err = error; + arg->err = error; goto out; } EXPORT_SYMBOL(inet_csk_accept); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c951bb9cc2e0..c3b0b610b0aa 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -795,7 +795,7 @@ done: /* Accept a pending connection */ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *nsk; @@ -809,7 +809,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, goto done; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection */ add_wait_queue_exclusive(sk_sleep(sk), &wait); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index fde1140d899e..4eb52add7103 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -688,14 +688,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. - * @flags: User specified operational flags. - * @kern: If the socket is kernel internal + * @arg: User specified arguments * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. */ -static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int llc_ui_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bb8f96f2b86f..815ce439183c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3881,11 +3881,10 @@ unlock: } static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; - int err; pr_debug("msk=%p", msk); @@ -3897,9 +3896,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, return -EINVAL; pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); - newsk = inet_csk_accept(ssk, flags, &err, kern); + newsk = inet_csk_accept(ssk, arg); if (!newsk) - return err; + return arg->err; pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { @@ -3920,7 +3919,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - newsk->sk_kern_sock = kern; + newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); @@ -3949,7 +3948,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, } } else { tcpfallback: - newsk->sk_kern_sock = kern; + newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); /* we are being invoked after accepting a non-mp-capable diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 104a80b75477..6ee148f0e6d0 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -772,8 +772,8 @@ out_release: return err; } -static int nr_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int nr_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; @@ -805,7 +805,7 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags, if (skb) break; - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index d5344563e525..57a2f97004e1 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -447,7 +447,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent, } static int llcp_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *new_sk; @@ -463,7 +463,7 @@ static int llcp_sock_accept(struct socket *sock, struct socket *newsock, goto error; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection. */ add_wait_queue_exclusive(sk_sleep(sk), &wait); diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 3dd5f52bc1b5..53a858478e22 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -759,8 +759,8 @@ static void pep_sock_close(struct sock *sk, long timeout) sock_put(sk); } -static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, - bool kern) +static struct sock *pep_sock_accept(struct sock *sk, + struct proto_accept_arg *arg) { struct pep_sock *pn = pep_sk(sk), *newpn; struct sock *newsk = NULL; @@ -772,8 +772,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, u8 pipe_handle, enabled, n_sb; u8 aligned = 0; - skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, - errp); + skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, + &arg->err); if (!skb) return NULL; @@ -836,7 +836,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, /* Create a new to-be-accepted sock */ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, - kern); + arg->kern); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; @@ -878,7 +878,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, drop: release_sock(sk); kfree_skb(skb); - *errp = err; + arg->err = err; return newsk; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 1018340d89a7..5ce0b3ee5def 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -292,18 +292,17 @@ out: } static int pn_socket_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; - int err; if (unlikely(sk->sk_state != TCP_LISTEN)) return -EINVAL; - newsk = sk->sk_prot->accept(sk, flags, &err, kern); + newsk = sk->sk_prot->accept(sk, arg); if (!newsk) - return err; + return arg->err; lock_sock(newsk); sock_graft(newsk, newsock); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 05008ce5c421..d89bd8d0c354 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -105,6 +105,10 @@ int rds_tcp_accept_one(struct socket *sock) int conn_state; struct rds_conn_path *cp; struct in6_addr *my_addr, *peer_addr; + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + .kern = true, + }; #if !IS_ENABLED(CONFIG_IPV6) struct in6_addr saddr, daddr; #endif @@ -119,7 +123,7 @@ int rds_tcp_accept_one(struct socket *sock) if (ret) goto out; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true); + ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ef81d019b20f..59050caab65c 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -919,8 +919,8 @@ out_release: return err; } -static int rose_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int rose_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; @@ -953,7 +953,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags, if (skb) break; - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 64196b1dce1d..c009383369b2 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4847,7 +4847,7 @@ static int sctp_disconnect(struct sock *sk, int flags) * descriptor will be returned from accept() to represent the newly * formed association. */ -static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) +static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg) { struct sctp_sock *sp; struct sctp_endpoint *ep; @@ -4871,7 +4871,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) goto out; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); error = sctp_wait_for_accept(sk, timeo); if (error) @@ -4882,7 +4882,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) */ asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); - newsk = sp->pf->create_accept_sk(sk, asoc, kern); + newsk = sp->pf->create_accept_sk(sk, asoc, arg->kern); if (!newsk) { error = -ENOMEM; goto out; @@ -4899,7 +4899,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) out: release_sock(sk); - *err = error; + arg->err = error; return newsk; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9389f0cfa374..e50a286fd0fb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2689,7 +2689,7 @@ out: } static int smc_accept(struct socket *sock, struct socket *new_sock, - int flags, bool kern) + struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); @@ -2708,7 +2708,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, } /* Wait for an incoming connection */ - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { set_current_state(TASK_INTERRUPTIBLE); @@ -2735,7 +2735,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (rc) goto out; - if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * MSEC_PER_SEC); diff --git a/net/socket.c b/net/socket.c index 01a71ae10c35..6ff5f21d9633 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1898,6 +1898,9 @@ struct file *do_accept(struct file *file, unsigned file_flags, struct file *newfile; int err, len; struct sockaddr_storage address; + struct proto_accept_arg arg = { + .flags = file_flags, + }; const struct proto_ops *ops; sock = sock_from_file(file); @@ -1926,8 +1929,8 @@ struct file *do_accept(struct file *file, unsigned file_flags, if (err) goto out_fd; - err = ops->accept(sock, newsock, sock->file->f_flags | file_flags, - false); + arg.flags |= sock->file->f_flags; + err = ops->accept(sock, newsock, &arg); if (err < 0) goto out_fd; @@ -3580,6 +3583,10 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; const struct proto_ops *ops = READ_ONCE(sock->ops); + struct proto_accept_arg arg = { + .flags = flags, + .kern = true, + }; int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, @@ -3587,7 +3594,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) if (err < 0) goto done; - err = ops->accept(sock, *newsock, flags, true); + err = ops->accept(sock, *newsock, &arg); if (err < 0) { sock_release(*newsock); *newsock = NULL; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 798397b6811e..2d58ecae4e21 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -146,8 +146,6 @@ static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); -static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, - bool kern); static void tipc_sk_timeout(struct timer_list *t); static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua); static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua); @@ -2711,13 +2709,12 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * tipc_accept - wait for connection request * @sock: listening socket * @new_sock: new socket that is to be connected - * @flags: file-related flags associated with socket - * @kern: caused by kernel or by userspace? + * @arg: arguments for accept * * Return: 0 on success, errno otherwise */ -static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, - bool kern) +static int tipc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg) { struct sock *new_sk, *sk = sock->sk; struct tipc_sock *new_tsock; @@ -2733,14 +2730,14 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, res = -EINVAL; goto exit; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); res = tipc_wait_for_accept(sock, timeo); if (res) goto exit; buf = skb_peek(&sk->sk_receive_queue); - res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern); + res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, arg->kern); if (res) goto exit; security_sk_clone(sock->sk, new_sock->sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dc1651541723..26e3b5f1ee46 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -755,7 +755,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); -static int unix_accept(struct socket *, struct socket *, int, bool); +static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, @@ -1689,19 +1689,18 @@ static void unix_sock_inherit_flags(const struct socket *old, set_bit(SOCK_PASSSEC, &new->flags); } -static int unix_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int unix_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; struct sock *tsk; - int err; - err = -EOPNOTSUPP; + arg->err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; - err = -EINVAL; + arg->err = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out; @@ -1709,12 +1708,12 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, * so that no locks are necessary. */ - skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, - &err); + skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, + &arg->err); if (!skb) { /* This means receive shutdown. */ - if (err == 0) - err = -EINVAL; + if (arg->err == 0) + arg->err = -EINVAL; goto out; } @@ -1732,7 +1731,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, return 0; out: - return err; + return arg->err; } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 54ba7316f808..4b040285aa78 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1500,8 +1500,8 @@ out: return err; } -static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int vsock_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *listener; int err; @@ -1528,7 +1528,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ - timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK); + timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK); prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); while ((connected = vsock_dequeue_accept(listener)) == NULL && diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d18d51412cc0..8dda4178497c 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -871,8 +871,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout) return rc; } -static int x25_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int x25_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; -- cgit v1.2.3-59-g8ed1b From 9841991a446c87f90f66f4b9fee6fe934c1336a2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 May 2024 22:48:35 +0900 Subject: af_unix: Update unix_sk(sk)->oob_skb under sk_receive_queue lock. Billy Jheng Bing-Jhong reported a race between __unix_gc() and queue_oob(). __unix_gc() tries to garbage-collect close()d inflight sockets, and then if the socket has MSG_OOB in unix_sk(sk)->oob_skb, GC will drop the reference and set NULL to it locklessly. However, the peer socket still can send MSG_OOB message and queue_oob() can update unix_sk(sk)->oob_skb concurrently, leading NULL pointer dereference. [0] To fix the issue, let's update unix_sk(sk)->oob_skb under the sk_receive_queue's lock and take it everywhere we touch oob_skb. Note that we defer kfree_skb() in manage_oob() to silence lockdep false-positive (See [1]). [0]: BUG: kernel NULL pointer dereference, address: 0000000000000008 PF: supervisor write access in kernel mode PF: error_code(0x0002) - not-present page PGD 8000000009f5e067 P4D 8000000009f5e067 PUD 9f5d067 PMD 0 Oops: 0002 [#1] PREEMPT SMP PTI CPU: 3 PID: 50 Comm: kworker/3:1 Not tainted 6.9.0-rc5-00191-gd091e579b864 #110 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Workqueue: events delayed_fput RIP: 0010:skb_dequeue (./include/linux/skbuff.h:2386 ./include/linux/skbuff.h:2402 net/core/skbuff.c:3847) Code: 39 e3 74 3e 8b 43 10 48 89 ef 83 e8 01 89 43 10 49 8b 44 24 08 49 c7 44 24 08 00 00 00 00 49 8b 14 24 49 c7 04 24 00 00 00 00 <48> 89 42 08 48 89 10 e8 e7 c5 42 00 4c 89 e0 5b 5d 41 5c c3 cc cc RSP: 0018:ffffc900001bfd48 EFLAGS: 00000002 RAX: 0000000000000000 RBX: ffff8880088f5ae8 RCX: 00000000361289f9 RDX: 0000000000000000 RSI: 0000000000000206 RDI: ffff8880088f5b00 RBP: ffff8880088f5b00 R08: 0000000000080000 R09: 0000000000000001 R10: 0000000000000003 R11: 0000000000000001 R12: ffff8880056b6a00 R13: ffff8880088f5280 R14: 0000000000000001 R15: ffff8880088f5a80 FS: 0000000000000000(0000) GS:ffff88807dd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000006314000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: unix_release_sock (net/unix/af_unix.c:654) unix_release (net/unix/af_unix.c:1050) __sock_release (net/socket.c:660) sock_close (net/socket.c:1423) __fput (fs/file_table.c:423) delayed_fput (fs/file_table.c:444 (discriminator 3)) process_one_work (kernel/workqueue.c:3259) worker_thread (kernel/workqueue.c:3329 kernel/workqueue.c:3416) kthread (kernel/kthread.c:388) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) Modules linked in: CR2: 0000000000000008 Link: https://lore.kernel.org/netdev/a00d3993-c461-43f2-be6d-07259c98509a@rbox.co/ [1] Fixes: 1279f9d9dec2 ("af_unix: Call kfree_skb() for dead unix_(sk)->oob_skb in GC.") Reported-by: Billy Jheng Bing-Jhong Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240516134835.8332-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fa906ec5e657..ca101690e740 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2171,13 +2171,15 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other maybe_add_creds(skb, sock, other); skb_get(skb); + scm_stat_add(other, skb); + + spin_lock(&other->sk_receive_queue.lock); if (ousk->oob_skb) consume_skb(ousk->oob_skb); - WRITE_ONCE(ousk->oob_skb, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); - scm_stat_add(other, skb); - skb_queue_tail(&other->sk_receive_queue, skb); sk_send_sigurg(other); unix_state_unlock(other); other->sk_data_ready(other); @@ -2568,8 +2570,10 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) mutex_lock(&u->iolock); unix_state_lock(sk); + spin_lock(&sk->sk_receive_queue.lock); if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { + spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); mutex_unlock(&u->iolock); return -EINVAL; @@ -2581,6 +2585,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) WRITE_ONCE(u->oob_skb, NULL); else skb_get(oob_skb); + + spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); @@ -2609,6 +2615,10 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, consume_skb(skb); skb = NULL; } else { + struct sk_buff *unlinked_skb = NULL; + + spin_lock(&sk->sk_receive_queue.lock); + if (skb == u->oob_skb) { if (copied) { skb = NULL; @@ -2620,13 +2630,19 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } else if (flags & MSG_PEEK) { skb = NULL; } else { - skb_unlink(skb, &sk->sk_receive_queue); + __skb_unlink(skb, &sk->sk_receive_queue); WRITE_ONCE(u->oob_skb, NULL); - if (!WARN_ON_ONCE(skb_unref(skb))) - kfree_skb(skb); + unlinked_skb = skb; skb = skb_peek(&sk->sk_receive_queue); } } + + spin_unlock(&sk->sk_receive_queue.lock); + + if (unlinked_skb) { + WARN_ON_ONCE(skb_unref(unlinked_skb)); + kfree_skb(unlinked_skb); + } } return skb; } -- cgit v1.2.3-59-g8ed1b