From 5648b5e1169ff1d6d6a46c35c0b5fbebd2a5cbb2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Oct 2021 18:08:10 +0200 Subject: netfilter: nfnetlink_queue: fix OOB when mac header was cleared On 64bit platforms the MAC header is set to 0xffff on allocation and also when a helper like skb_unset_mac_header() is called. dev_parse_header may call skb_mac_header() which assumes valid mac offset: BUG: KASAN: use-after-free in eth_header_parse+0x75/0x90 Read of size 6 at addr ffff8881075a5c05 by task nf-queue/1364 Call Trace: memcpy+0x20/0x60 eth_header_parse+0x75/0x90 __nfqnl_enqueue_packet+0x1a61/0x3380 __nf_queue+0x597/0x1300 nf_queue+0xf/0x40 nf_hook_slow+0xed/0x190 nf_hook+0x184/0x440 ip_output+0x1c0/0x2a0 nf_reinject+0x26f/0x700 nfqnl_recv_verdict+0xa16/0x18b0 nfnetlink_rcv_msg+0x506/0xe70 The existing code only works if the skb has a mac header. Fixes: 2c38de4c1f8da7 ("netfilter: fix looped (broad|multi)cast's MAC handling") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 4c3fbaaeb103..4acc4b8e9fe5 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -560,7 +560,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure; if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { + skb_mac_header_was_set(entskb)) { struct nfqnl_msg_packet_hw phw; int len; -- cgit v1.2.3-59-g8ed1b From 2199f562730dd1382946e0a2532afc38cd444129 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 21 Oct 2021 15:02:55 +0200 Subject: ipvs: autoload ipvs on genl access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel provides the functionality to automatically load modules providing genl families. Use this to remove the need for users to manually load the module. Signed-off-by: Thomas Weißschuh Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 29ec3ef63edc..0ff94c66641f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -48,6 +48,8 @@ #include +MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); + /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ static DEFINE_MUTEX(__ip_vs_mutex); -- cgit v1.2.3-59-g8ed1b From c4777efa751d293e369aec464ce6875e957be255 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Nov 2021 17:45:55 -0700 Subject: net: add and use skb_unclone_keeptruesize() helper While commit 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") fixed immediate issues found when KFENCE was enabled/tested, there are still similar issues, when tcp_trim_head() hits KFENCE while the master skb is cloned. This happens under heavy networking TX workloads, when the TX completion might be delayed after incoming ACK. This patch fixes the WARNING in sk_stream_kill_queues when sk->sk_mem_queued/sk->sk_forward_alloc are not zero. 
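As a rough illustration (not a hunk from this patch), the save/restore
pattern that the new helper wraps is sketched below; skb_cloned(),
pskb_expand_head() and skb->truesize are the real kernel symbols, while
the function name is made up. The helper added in the diff additionally
keeps the might_sleep_if() annotation from skb_unclone().

    /* Sketch only: kernel context, needs <linux/skbuff.h>; name is made up. */
    static inline int sketch_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
    {
            int res = 0;

            if (skb_cloned(skb)) {
                    /* pskb_expand_head() may reallocate the head; under KFENCE
                     * the new ksize() can differ from the old one, silently
                     * changing skb->truesize after the old value was already
                     * charged to sk_wmem_queued/sk_forward_alloc.
                     */
                    unsigned int save = skb->truesize;

                    res = pskb_expand_head(skb, 0, 0, pri);
                    skb->truesize = save;   /* charge and uncharge stay equal */
            }
            return res;
    }

Keeping truesize stable means the uncharge done when the skb leaves the
write queue matches the charge done when it was queued, so
sk_stream_kill_queues() no longer finds a leftover balance at close.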
Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Eric Dumazet Acked-by: Marco Elver Link: https://lore.kernel.org/r/20211102004555.1359210-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++++++++++++ net/core/skbuff.c | 14 +------------- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bd6520329f6..a63e13082397 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1675,6 +1675,22 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } +/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + might_sleep_if(gfpflags_allow_blocking(pri)); + + if (skb_cloned(skb)) { + unsigned int save = skb->truesize; + int res; + + res = pskb_expand_head(skb, 0, 0, pri); + skb->truesize = save; + return res; + } + return 0; +} + /** * skb_header_cloned - is the header a clone * @skb: buffer to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 67a9188d8a49..3ec42cdee16a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3449,19 +3449,7 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - int ret = 0; - - if (skb_cloned(skb)) { - /* Save and restore truesize: pskb_expand_head() may reallocate - * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we - * cannot change truesize at this point. - */ - unsigned int save_truesize = skb->truesize; - - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - skb->truesize = save_truesize; - } - return ret; + return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6fbbf1558033..76cc1641beb4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1559,7 +1559,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } - if (skb_unclone(skb, gfp)) + if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -1667,7 +1667,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); @@ -3166,7 +3166,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); -- cgit v1.2.3-59-g8ed1b From c081d53f97a1a90a38e4296dd3d6fda5e38dca2c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:47 -0400 Subject: security: pass asoc to sctp_assoc_request and sctp_sk_clone This patch is to move secid and peer_secid from endpoint to association, and pass asoc to sctp_assoc_request and sctp_sk_clone instead of ep. As ep is the local endpoint and asoc represents a connection, and in SCTP one sk/ep could have multiple asoc/connection, saving secid/peer_secid for new asoc will overwrite the old asoc's. Note that since asoc can be passed as NULL, security_sctp_assoc_request() is moved to the place right after the new_asoc is created in sctp_sf_do_5_1B_init() and sctp_sf_do_unexpected_init(). v1->v2: - fix the description of selinux_netlbl_skbuff_setsid(), as Jakub noticed. 
- fix the annotation in selinux_sctp_assoc_request(), as Richard Noticed. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 28 ++++++++++++++-------------- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/lsm_hooks.h | 8 ++++---- include/linux/security.h | 10 +++++----- include/net/sctp/structs.h | 20 ++++++++++---------- net/sctp/sm_statefuns.c | 26 +++++++++++++------------- net/sctp/socket.c | 5 ++--- security/security.c | 8 ++++---- security/selinux/hooks.c | 22 +++++++++++----------- security/selinux/include/netlabel.h | 4 ++-- security/selinux/netlabel.c | 18 +++++++++--------- 11 files changed, 76 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 0bcf6c1245ee..415b548d9ce0 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -26,11 +26,11 @@ described in the `SCTP SELinux Support`_ chapter. security_sctp_assoc_request() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Passes the ``@ep`` and ``@chunk->skb`` of the association INIT packet to the +Passes the ``@asoc`` and ``@chunk->skb`` of the association INIT packet to the security module. Returns 0 on success, error on failure. :: - @ep - pointer to sctp endpoint structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of association packet. @@ -117,9 +117,9 @@ Called whenever a new socket is created by **accept**\(2) calls **sctp_peeloff**\(3). :: - @ep - pointer to current sctp endpoint structure. + @asoc - pointer to current sctp association structure. @sk - pointer to current sock structure. - @sk - pointer to new sock structure. + @newsk - pointer to new sock structure. security_inet_conn_established() @@ -200,22 +200,22 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Passes the ``@ep`` and ``@chunk->skb`` of the association INIT packet to the +Passes the ``@asoc`` and ``@chunk->skb`` of the association INIT packet to the security module. Returns 0 on success, error on failure. :: - @ep - pointer to sctp endpoint structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of association packet. The security module performs the following operations: - IF this is the first association on ``@ep->base.sk``, then set the peer + IF this is the first association on ``@asoc->base.sk``, then set the peer sid to that in ``@skb``. This will ensure there is only one peer sid - assigned to ``@ep->base.sk`` that may support multiple associations. + assigned to ``@asoc->base.sk`` that may support multiple associations. - ELSE validate the ``@ep->base.sk peer_sid`` against the ``@skb peer sid`` + ELSE validate the ``@asoc->base.sk peer_sid`` against the ``@skb peer sid`` to determine whether the association should be allowed or denied. - Set the sctp ``@ep sid`` to socket's sid (from ``ep->base.sk``) with + Set the sctp ``@asoc sid`` to socket's sid (from ``asoc->base.sk``) with MLS portion taken from ``@skb peer sid``. This will be used by SCTP TCP style sockets and peeled off connections as they cause a new socket to be generated. @@ -259,13 +259,13 @@ security_sctp_sk_clone() Called whenever a new socket is created by **accept**\(2) (i.e. 
a TCP style socket) or when a socket is 'peeled off' e.g userspace calls **sctp_peeloff**\(3). ``security_sctp_sk_clone()`` will set the new -sockets sid and peer sid to that contained in the ``@ep sid`` and -``@ep peer sid`` respectively. +sockets sid and peer sid to that contained in the ``@asoc sid`` and +``@asoc peer sid`` respectively. :: - @ep - pointer to current sctp endpoint structure. + @asoc - pointer to current sctp association structure. @sk - pointer to current sock structure. - @sk - pointer to new sock structure. + @newsk - pointer to new sock structure. security_inet_conn_established() diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index a9ac70ae01ab..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -329,11 +329,11 @@ LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) LSM_HOOK(int, 0, tun_dev_open, void *security) -LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_endpoint *ep, +LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) -LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_endpoint *ep, +LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) #endif /* CONFIG_SECURITY_NETWORK */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0bada4df23fc..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1027,9 +1027,9 @@ * Security hooks for SCTP * * @sctp_assoc_request: - * Passes the @ep and @chunk->skb of the association INIT packet to + * Passes the @asoc and @chunk->skb of the association INIT packet to * the security module. - * @ep pointer to sctp endpoint structure. + * @asoc pointer to sctp association structure. * @skb pointer to skbuff of association packet. * Return 0 on success, error on failure. * @sctp_bind_connect: @@ -1047,9 +1047,9 @@ * Called whenever a new socket is created by accept(2) (i.e. a TCP * style socket) or when a socket is 'peeled off' e.g userspace * calls sctp_peeloff(3). - * @ep pointer to current sctp endpoint structure. + * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. - * @sk pointer to new sock structure. + * @newsk pointer to new sock structure. 
* * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 7e0ba63b5dde..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -179,7 +179,7 @@ struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; struct seq_file; -struct sctp_endpoint; +struct sctp_association; #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; @@ -1425,10 +1425,10 @@ int security_tun_dev_create(void); int security_tun_dev_attach_queue(void *security); int security_tun_dev_attach(struct sock *sk, void *security); int security_tun_dev_open(void *security); -int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb); +int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb); int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); -void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); #else /* CONFIG_SECURITY_NETWORK */ @@ -1631,7 +1631,7 @@ static inline int security_tun_dev_open(void *security) return 0; } -static inline int security_sctp_assoc_request(struct sctp_endpoint *ep, +static inline int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return 0; @@ -1644,7 +1644,7 @@ static inline int security_sctp_bind_connect(struct sock *sk, int optname, return 0; } -static inline void security_sctp_sk_clone(struct sctp_endpoint *ep, +static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 651bba654d77..899c29c326ba 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1355,16 +1355,6 @@ struct sctp_endpoint { reconf_enable:1; __u8 strreset_enable; - - /* Security identifiers from incoming (INIT). These are set by - * security_sctp_assoc_request(). These will only be used by - * SCTP TCP type sockets and peeled off connections as they - * cause a new socket to be generated. security_sctp_sk_clone() - * will then plug these into the new socket. - */ - - u32 secid; - u32 peer_secid; }; /* Recover the outter endpoint structure. */ @@ -2104,6 +2094,16 @@ struct sctp_association { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + /* Security identifiers from incoming (INIT). These are set by + * security_sctp_assoc_request(). These will only be used by + * SCTP TCP type sockets and peeled off connections as they + * cause a new socket to be generated. security_sctp_sk_clone() + * will then plug these into the new socket. + */ + + u32 secid; + u32 peer_secid; + struct rcu_head rcu; }; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index fb3da4d8f4a3..3206374209bc 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -326,11 +326,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. 
@@ -415,6 +410,12 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) @@ -780,7 +781,6 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, } } - /* Delay state machine commands until later. * * Re-build the bind address for the association is done in @@ -1517,11 +1517,6 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. @@ -1594,6 +1589,12 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) goto nomem; @@ -2255,8 +2256,7 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( } /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) { + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { sctp_association_free(new_asoc); return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b937bfd4751..33391254fa82 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9412,7 +9412,6 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; struct sctp_sock *sp = sctp_sk(sk); - struct sctp_endpoint *ep = sp->ep; newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; @@ -9457,9 +9456,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, net_enable_timestamp(); /* Set newsk security attributes from original sk and connection - * security attribute from ep. + * security attribute from asoc. 
*/ - security_sctp_sk_clone(ep, sk, newsk); + security_sctp_sk_clone(asoc, sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, diff --git a/security/security.c b/security/security.c index 95e30fadba78..c88167a414b4 100644 --- a/security/security.c +++ b/security/security.c @@ -2367,9 +2367,9 @@ int security_tun_dev_open(void *security) } EXPORT_SYMBOL(security_tun_dev_open); -int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb) +int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { - return call_int_hook(sctp_assoc_request, 0, ep, skb); + return call_int_hook(sctp_assoc_request, 0, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); @@ -2381,10 +2381,10 @@ int security_sctp_bind_connect(struct sock *sk, int optname, } EXPORT_SYMBOL(security_sctp_bind_connect); -void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { - call_void_hook(sctp_sk_clone, ep, sk, newsk); + call_void_hook(sctp_sk_clone, asoc, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ea7b2876a5ae..62d30c0a30c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5339,10 +5339,10 @@ static void selinux_sock_graft(struct sock *sk, struct socket *parent) * connect(2), sctp_connectx(3) or sctp_sendmsg(3) (with no association * already present). */ -static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, +static int selinux_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { - struct sk_security_struct *sksec = ep->base.sk->sk_security; + struct sk_security_struct *sksec = asoc->base.sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; u8 peerlbl_active; @@ -5359,7 +5359,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, /* This will return peer_sid = SECSID_NULL if there are * no peer labels, see security_net_peersid_resolve(). */ - err = selinux_skb_peerlbl_sid(skb, ep->base.sk->sk_family, + err = selinux_skb_peerlbl_sid(skb, asoc->base.sk->sk_family, &peer_sid); if (err) return err; @@ -5383,7 +5383,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, */ ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; - ad.u.net->sk = ep->base.sk; + ad.u.net->sk = asoc->base.sk; err = avc_has_perm(&selinux_state, sksec->peer_sid, peer_sid, sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad); @@ -5392,7 +5392,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, } /* Compute the MLS component for the connection and store - * the information in ep. This will be used by SCTP TCP type + * the information in asoc. This will be used by SCTP TCP type * sockets and peeled off connections as they cause a new * socket to be generated. selinux_sctp_sk_clone() will then * plug this into the new socket. @@ -5401,11 +5401,11 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, if (err) return err; - ep->secid = conn_sid; - ep->peer_secid = peer_sid; + asoc->secid = conn_sid; + asoc->peer_secid = peer_sid; /* Set any NetLabel labels including CIPSO/CALIPSO options. 
*/ - return selinux_netlbl_sctp_assoc_request(ep, skb); + return selinux_netlbl_sctp_assoc_request(asoc, skb); } /* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting @@ -5490,7 +5490,7 @@ static int selinux_sctp_bind_connect(struct sock *sk, int optname, } /* Called whenever a new socket is created by accept(2) or sctp_peeloff(3). */ -static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = sk->sk_security; @@ -5502,8 +5502,8 @@ static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); - newsksec->sid = ep->secid; - newsksec->peer_sid = ep->peer_secid; + newsksec->sid = asoc->secid; + newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); } diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h index 0c58f62dc6ab..4d0456d3d459 100644 --- a/security/selinux/include/netlabel.h +++ b/security/selinux/include/netlabel.h @@ -39,7 +39,7 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, int selinux_netlbl_skbuff_setsid(struct sk_buff *skb, u16 family, u32 sid); -int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb); int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family); void selinux_netlbl_inet_csk_clone(struct sock *sk, u16 family); @@ -98,7 +98,7 @@ static inline int selinux_netlbl_skbuff_setsid(struct sk_buff *skb, return 0; } -static inline int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +static inline int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return 0; diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 29b88e81869b..1321f15799e2 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -261,30 +261,30 @@ skbuff_setsid_return: /** * selinux_netlbl_sctp_assoc_request - Label an incoming sctp association. - * @ep: incoming association endpoint. + * @asoc: incoming association. * @skb: the packet. * * Description: - * A new incoming connection is represented by @ep, ...... + * A new incoming connection is represented by @asoc, ...... * Returns zero on success, negative values on failure. 
* */ -int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { int rc; struct netlbl_lsm_secattr secattr; - struct sk_security_struct *sksec = ep->base.sk->sk_security; + struct sk_security_struct *sksec = asoc->base.sk->sk_security; struct sockaddr_in addr4; struct sockaddr_in6 addr6; - if (ep->base.sk->sk_family != PF_INET && - ep->base.sk->sk_family != PF_INET6) + if (asoc->base.sk->sk_family != PF_INET && + asoc->base.sk->sk_family != PF_INET6) return 0; netlbl_secattr_init(&secattr); rc = security_netlbl_sid_to_secattr(&selinux_state, - ep->secid, &secattr); + asoc->secid, &secattr); if (rc != 0) goto assoc_request_return; @@ -294,11 +294,11 @@ int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, if (ip_hdr(skb)->version == 4) { addr4.sin_family = AF_INET; addr4.sin_addr.s_addr = ip_hdr(skb)->saddr; - rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr4, &secattr); + rc = netlbl_conn_setattr(asoc->base.sk, (void *)&addr4, &secattr); } else if (IS_ENABLED(CONFIG_IPV6) && ip_hdr(skb)->version == 6) { addr6.sin6_family = AF_INET6; addr6.sin6_addr = ipv6_hdr(skb)->saddr; - rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr6, &secattr); + rc = netlbl_conn_setattr(asoc->base.sk, (void *)&addr6, &secattr); } else { rc = -EAFNOSUPPORT; } -- cgit v1.2.3-59-g8ed1b From e215dab1c49070cd75620afd801f777207a5b65c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:48 -0400 Subject: security: call security_sctp_assoc_request in sctp_sf_do_5_1D_ce The asoc created when receives the INIT chunk is a temporary one, it will be deleted after INIT_ACK chunk is replied. So for the real asoc created in sctp_sf_do_5_1D_ce() when the COOKIE_ECHO chunk is received, security_sctp_assoc_request() should also be called. v1->v2: - fix some typo and grammar errors, noticed by Ondrej. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 15 +++++++++------ net/sctp/sm_statefuns.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 415b548d9ce0..d5fd6ccc3dcb 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -151,9 +151,9 @@ establishing an association. INIT ---------------------------------------------> sctp_sf_do_5_1B_init() Respond to an INIT chunk. - SCTP peer endpoint "A" is - asking for an association. Call - security_sctp_assoc_request() + SCTP peer endpoint "A" is asking + for a temporary association. + Call security_sctp_assoc_request() to set the peer label if first association. If not first association, check @@ -163,9 +163,12 @@ establishing an association. | discard the packet. | COOKIE ECHO ------------------------------------------> - | - | - | + sctp_sf_do_5_1D_ce() + Respond to an COOKIE ECHO chunk. + Confirm the cookie and create a + permanent association. + Call security_sctp_assoc_request() to + do the same as for INIT chunk Response. 
<------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3206374209bc..b818532c3fc2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -781,6 +781,11 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, } } + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + /* Delay state machine commands until later. * * Re-build the bind address for the association is done in -- cgit v1.2.3-59-g8ed1b From 7c2ef0240e6abfd3cc59511339517358350a8910 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:49 -0400 Subject: security: add sctp_assoc_established hook security_sctp_assoc_established() is added to replace security_inet_conn_established() called in sctp_sf_do_5_1E_ca(), so that asoc can be accessed in security subsystem and save the peer secid to asoc->peer_secid. v1->v2: - fix the return value of security_sctp_assoc_established() in security.h, found by kernel test robot and Ondrej. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 22 ++++++++++------------ include/linux/lsm_hook_defs.h | 2 ++ include/linux/lsm_hooks.h | 5 +++++ include/linux/security.h | 7 +++++++ net/sctp/sm_statefuns.c | 2 +- security/security.c | 7 +++++++ 6 files changed, 32 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index d5fd6ccc3dcb..406cc68b8808 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -15,10 +15,7 @@ For security module support, three SCTP specific hooks have been implemented:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - -Also the following security hook has been utilised:: - - security_inet_conn_established() + security_sctp_assoc_established() The usage of these hooks are described below with the SELinux implementation described in the `SCTP SELinux Support`_ chapter. @@ -122,11 +119,12 @@ calls **sctp_peeloff**\(3). @newsk - pointer to new sock structure. -security_inet_conn_established() +security_sctp_assoc_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Called when a COOKIE ACK is received:: +Called when a COOKIE ACK is received, and the peer secid will be +saved into ``@asoc->peer_secid`` for client:: - @sk - pointer to sock structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of the COOKIE ACK packet. @@ -134,7 +132,7 @@ Security Hooks used for Association Establishment ------------------------------------------------- The following diagram shows the use of ``security_sctp_bind_connect()``, -``security_sctp_assoc_request()``, ``security_inet_conn_established()`` when +``security_sctp_assoc_request()``, ``security_sctp_assoc_established()`` when establishing an association. :: @@ -172,7 +170,7 @@ establishing an association. <------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | - Call security_inet_conn_established() | + Call security_sctp_assoc_established() | to set the peer label. 
| | | | If SCTP_SOCKET_TCP or peeled off @@ -198,7 +196,7 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - security_inet_conn_established() + security_sctp_assoc_established() security_sctp_assoc_request() @@ -271,12 +269,12 @@ sockets sid and peer sid to that contained in the ``@asoc sid`` and @newsk - pointer to new sock structure. -security_inet_conn_established() +security_sctp_assoc_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Called when a COOKIE ACK is received where it sets the connection's peer sid to that in ``@skb``:: - @sk - pointer to sock structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of the COOKIE ACK packet. diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..442a611fa0fb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,6 +335,8 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) +LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, + struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d45b6f6e27fd..d6823214d5c1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,6 +1050,11 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. + * @sctp_assoc_established: + * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet + * to the security module. + * @asoc pointer to sctp association structure. + * @skb pointer to skbuff of association packet. * * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index bbf44a466832..06eac4e61a13 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,6 +1430,8 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); +void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1649,6 +1651,11 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } + +static inline void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ +} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index b818532c3fc2..5fabaa54b77d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -946,7 +946,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); /* Set peer label for connection. 
*/ - security_inet_conn_established(ep->base.sk, chunk->skb); + security_sctp_assoc_established((struct sctp_association *)asoc, chunk->skb); /* RFC 2960 5.1 Normal Establishment of an Association * diff --git a/security/security.c b/security/security.c index c88167a414b4..779a9edea0a0 100644 --- a/security/security.c +++ b/security/security.c @@ -2388,6 +2388,13 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, } EXPORT_SYMBOL(security_sctp_sk_clone); +void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ + call_void_hook(sctp_assoc_established, asoc, skb); +} +EXPORT_SYMBOL(security_sctp_assoc_established); + #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND -- cgit v1.2.3-59-g8ed1b From aedddb4e45b34426cfbfa84454b6f203712733c5 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 2 Nov 2021 16:10:21 +0800 Subject: NFC: add necessary privilege flags in netlink layer The CAP_NET_ADMIN checks are needed to prevent attackers faking a device under NCIUARTSETDRIVER and exploit privileged commands. This patch add GENL_ADMIN_PERM flags in genl_ops to fulfill the check. Except for commands like NFC_CMD_GET_DEVICE, NFC_CMD_GET_TARGET, NFC_CMD_LLC_GET_PARAMS, and NFC_CMD_GET_SE, which are mainly information- read operations. Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- net/nfc/netlink.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 49089c50872e..334f63c9529e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1664,31 +1664,37 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, @@ -1706,26 +1712,31 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, @@ -1737,21 +1748,25 @@ static const struct genl_ops 
nfc_genl_ops[] = { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, + .flags = GENL_ADMIN_PERM, }, }; -- cgit v1.2.3-59-g8ed1b From acaea0d5a63406c052444ad3a7cb54241adaf805 Mon Sep 17 00:00:00 2001 From: Zhang Mingyu Date: Wed, 3 Nov 2021 06:46:17 +0000 Subject: net:ipv6:Remove unneeded semicolon Eliminate the following coccinelle check warning: net/ipv6/seg6.c:381:2-3 Reported-by: Zeal Robot Signed-off-by: Zhang Mingyu Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 5daa1c3ed83b..a8b5784afb1a 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -378,7 +378,7 @@ static int __net_init seg6_net_init(struct net *net) kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; - }; + } #endif return 0; -- cgit v1.2.3-59-g8ed1b From 9b65b17db72313b7a4fe9bc9502928c88be57986 Mon Sep 17 00:00:00 2001 From: Talal Ahmad Date: Tue, 2 Nov 2021 22:58:44 -0400 Subject: net: avoid double accounting for pure zerocopy skbs Track skbs containing only zerocopy data and avoid charging them to kernel memory to correctly account the memory utilization for msg_zerocopy. All of the data in such skbs is held in user pages which are already accounted to user. Before this change, they are charged again in kernel in __zerocopy_sg_from_iter. The charging in kernel is excessive because data is not being copied into skb frags. This excessive charging can lead to kernel going into memory pressure state which impacts all sockets in the system adversely. Mark pure zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove charge/uncharge for data in such skbs. Initially, an skb is marked pure zerocopy when it is empty and in zerocopy path. skb can then change from a pure zerocopy skb to mixed data skb (zerocopy and copy data) if it is at tail of write queue and there is room available in it and non-zerocopy data is being sent in the next sendmsg call. At this time sk_mem_charge is done for the pure zerocopied data and the pure zerocopy flag is unmarked. We found that this happens very rarely on workloads that pass MSG_ZEROCOPY. A pure zerocopy skb can later be coalesced into normal skb if they are next to each other in queue but this patch prevents coalescing from happening. This avoids complexity of charging when skb downgrades from pure zerocopy to mixed. This is also rare. In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge for SKB_TRUESIZE(skb_end_offset(skb)) is done for sk_mem_charge in tcp_skb_entail for an skb without data. Testing with the msg_zerocopy.c benchmark between two hosts(100G nics) with zerocopy showed that before this patch the 'sock' variable in memory.stat for cgroup2 that tracks sum of sk_forward_alloc, sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this change it is 0. This is due to no charge to sk_forward_alloc for zerocopy data and shows memory utilization for kernel is lowered. 
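To summarize the accounting rule (an annotated sketch, not a hunk from
the patch; skb_zcopy_pure(), SKB_TRUESIZE() and skb_end_offset() are the
real symbols, while the function name is made up):

    /* Sketch only: kernel context, needs <net/sock.h> and <linux/skbuff.h>.
     * A pure zerocopy skb carries nothing but user pages in its frags, so
     * only the linear head was ever charged to the socket; a mixed skb was
     * charged its full truesize.
     */
    static void sketch_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
    {
            sk_wmem_queued_add(sk, -skb->truesize);
            if (!skb_zcopy_pure(skb))
                    /* mixed or copied data: the whole truesize was charged */
                    sk_mem_uncharge(sk, skb->truesize);
            else
                    /* pure zerocopy: only the head was charged */
                    sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb)));
            __kfree_skb(skb);
    }

The flag is set when zerocopy data is appended to an empty skb and
cleared (after charging skb->data_len) if copied data is later appended
to the same skb, so the uncharge above always matches what was charged.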
With this commit we don't see the warning we saw in previous commit which resulted in commit 84882cf72cd774cf16fd338bdbf00f69ac9f9194. Signed-off-by: Talal Ahmad Acked-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Signed-off-by: Willem de Bruijn Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 19 ++++++++++++++++++- include/net/tcp.h | 8 ++++++-- net/core/datagram.c | 3 ++- net/core/skbuff.c | 3 ++- net/ipv4/tcp.c | 22 ++++++++++++++++++++-- net/ipv4/tcp_output.c | 7 +++++-- 6 files changed, 53 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a63e13082397..686a666d073d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,9 +454,15 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), + + /* segment contains only zerocopy data and should not be + * charged to the kernel memory. + */ + SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline bool skb_zcopy_pure(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; +} + +static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ + return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); +} + static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } diff --git a/include/net/tcp.h b/include/net/tcp.h index 70972f3ac8fa..4da22b41bde6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -293,7 +293,10 @@ static inline bool tcp_out_of_memory(struct sock *sk) static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); - sk_mem_uncharge(sk, skb->truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, skb->truesize); + else + sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } @@ -974,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && - mptcp_skb_can_collapse(to, from)); + mptcp_skb_can_collapse(to, from) && + skb_pure_zcopy_same(to, from)); } /* Events passed to congestion control interface */ diff --git a/net/core/datagram.c b/net/core/datagram.c index 15ab9ffb27fe..ee290776c661 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); - sk_mem_charge(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ec42cdee16a..ba2f38246f07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, void 
skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; - skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc7f419184aa..b461ae573afc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(skb)) { bool mem_scheduled; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); @@ -1319,6 +1320,15 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); + /* skb changing from pure zc to mixed, must charge zc */ + if (unlikely(skb_zcopy_pure(skb))) { + if (!sk_wmem_schedule(sk, skb->data_len)) + goto wait_for_space; + + sk_mem_charge(sk, skb->data_len); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; @@ -1339,8 +1349,16 @@ new_segment: } pfrag->offset += copy; } else { - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_space; + /* First append to a fragless skb builds initial + * pure zerocopy skb + */ + if (!skb->len) + skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; + + if (!skb_zcopy_pure(skb)) { + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_space; + } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 76cc1641beb4..6f7860e283c6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); - sk_mem_uncharge(sk, delta_truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) + if (unlikely(TCP_SKB_CB(skb)->eor) || + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; -- cgit v1.2.3-59-g8ed1b From 1aabe578dd86e9f2867c4db4fba9a15f4ba1825d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Nov 2021 15:02:36 -0700 Subject: ethtool: fix ethtool msg len calculation for pause stats ETHTOOL_A_PAUSE_STAT_MAX is the MAX attribute id, so we need to subtract non-stats and add one to get a count (IOW -2+1 == -1). Otherwise we'll see: ethnl cmd 21: calculated reply length 40, but consumed 52 Fixes: 9a27a33027f2 ("ethtool: add standard pause stats") Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- include/linux/ethtool_netlink.h | 3 +++ include/uapi/linux/ethtool_netlink.h | 4 +++- net/ethtool/pause.c | 3 +-- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 1e7bf78cb382..aba348d58ff6 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -10,6 +10,9 @@ #define __ETHTOOL_LINK_MODE_MASK_NWORDS \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) +#define ETHTOOL_PAUSE_STAT_CNT (__ETHTOOL_A_PAUSE_STAT_CNT - \ + ETHTOOL_A_PAUSE_STAT_TX_FRAMES) + enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index ca5fbb59fa42..999777d32dcf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -411,7 +411,9 @@ enum { ETHTOOL_A_PAUSE_STAT_TX_FRAMES, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - /* add new constants above here */ + /* add new constants above here + * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! + */ __ETHTOOL_A_PAUSE_STAT_CNT, ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 9009f412151e..ee1e5806bc93 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -56,8 +56,7 @@ static int pause_reply_size(const struct ethnl_req_info *req_base, if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ - nla_total_size_64bit(sizeof(u64)) * - (ETHTOOL_A_PAUSE_STAT_MAX - 2); + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } -- cgit v1.2.3-59-g8ed1b From 250962e4684678629afd2feeaefdc40c5db501f4 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 3 Nov 2021 16:28:43 +0800 Subject: net: udp6: replace __UDP_INC_STATS() with __UDP6_INC_STATS() __UDP_INC_STATS() is used in udpv6_queue_rcv_one_skb() when encap_rcv() fails. __UDP6_INC_STATS() should be used here, so replace it with __UDP6_INC_STATS(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- net/ipv6/udp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 12c12619ee35..e43b31d25fb6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -700,9 +700,9 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - __UDP_INC_STATS(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP6_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } -- cgit v1.2.3-59-g8ed1b From 563bcbae3ba233c275c244bfce2efe12938f5363 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 2 Nov 2021 10:12:18 +0800 Subject: net: vlan: fix a UAF in vlan_dev_real_dev() The real_dev of a vlan net_device may be freed after unregister_vlan_dev(). Access the real_dev continually by vlan_dev_real_dev() will trigger the UAF problem for the real_dev like following: ================================================================== BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120 Call Trace: kasan_report.cold+0x83/0xdf vlan_dev_real_dev+0xf9/0x120 is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0 is_eth_port_of_netdev_filter+0x28/0x40 ib_enum_roce_netdev+0x1a3/0x300 ib_enum_all_roce_netdevs+0xc7/0x140 netdevice_event_work_handler+0x9d/0x210 ... 
Freed by task 9288: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0xfc/0x130 slab_free_freelist_hook+0xdd/0x240 kfree+0xe4/0x690 kvfree+0x42/0x50 device_release+0x9f/0x240 kobject_put+0x1c8/0x530 put_device+0x1b/0x30 free_netdev+0x370/0x540 ppp_destroy_interface+0x313/0x3d0 ... Move the put_device(real_dev) to vlan_dev_free(). Ensure real_dev not be freed before vlan_dev unregistered. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com Signed-off-by: Ziyang Xuan Reviewed-by: Jason Gunthorpe Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 --- net/8021q/vlan_dev.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 55275ef9a31a..a3a0a5e994f5 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -123,9 +123,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); - - /* Get rid of the vlan's reference to real_dev */ - dev_put(real_dev); } int vlan_check_real_dev(struct net_device *real_dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 90330b893134..ab6dee28536d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -843,6 +843,9 @@ static void vlan_dev_free(struct net_device *dev) free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; + + /* Get rid of the vlan's reference to real_dev */ + dev_put(vlan->real_dev); } void vlan_setup(struct net_device *dev) -- cgit v1.2.3-59-g8ed1b From 92f62485b3715882cd397b0cbd80a96d179b86d6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 2 Nov 2021 21:31:22 +0200 Subject: net: dsa: felix: fix broken VLAN-tagged PTP under VLAN-aware bridge Normally it is expected that the dsa_device_ops :: rcv() method finishes parsing the DSA tag and consumes it, then never looks at it again. But commit c0bcf537667c ("net: dsa: ocelot: add hardware timestamping support for Felix") added support for RX timestamping in a very unconventional way. On this switch, a partial timestamp is available in the DSA header, but the driver got away with not parsing that timestamp right away, but instead delayed that parsing for a little longer: dsa_switch_rcv(): nskb = cpu_dp->rcv(skb, dev); <------------- not here -> ocelot_rcv() ... skb = nskb; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); ... if (dsa_skb_defer_rx_timestamp(p, skb)) <--- but here -> felix_rxtstamp() return 0; When in felix_rxtstamp(), this driver accounted for the fact that eth_type_trans() happened in the meanwhile, so it got a hold of the extraction header again by subtracting (ETH_HLEN + OCELOT_TAG_LEN) bytes from the current skb->data. This worked for quite some time but was quite fragile from the very beginning. Not to mention that having DSA tag parsing split in two different files, under different folders (net/dsa/tag_ocelot.c vs drivers/net/dsa/ocelot/felix.c) made it quite non-obvious for patches to come that they might break this. Finally, the blamed commit does the following: at the end of ocelot_rcv(), it checks whether the skb payload contains a VLAN header. If it does, and this port is under a VLAN-aware bridge, that VLAN ID might not be correct in the sense that the packet might have suffered VLAN rewriting due to TCAM rules (VCAP IS1). 
So we consume the VLAN ID from the skb payload using __skb_vlan_pop(), and take the classified VLAN ID from the DSA tag, and construct a hwaccel VLAN tag with the classified VLAN, and the skb payload is VLAN-untagged. The big problem is that __skb_vlan_pop() does: memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); __skb_pull(skb, VLAN_HLEN); aka it moves the Ethernet header 4 bytes to the right, and pulls 4 bytes from the skb headroom (effectively also moving skb->data, by definition). So for felix_rxtstamp()'s fragile logic, all bets are off now. Instead of having the "extraction" pointer point to the DSA header, it actually points to 4 bytes _inside_ the extraction header. Corollary, the last 4 bytes of the "extraction" header are in fact 4 stale bytes of the destination MAC address from the Ethernet header, from prior to the __skb_vlan_pop() movement. So of course, RX timestamps are completely bogus when the system is configured in this way. The fix is actually very simple: just don't structure the code like that. For better or worse, the DSA PTP timestamping API does not offer a straightforward way for drivers to present their RX timestamps, but other drivers (sja1105) have established a simple mechanism to carry their RX timestamp from dsa_device_ops :: rcv() all the way to dsa_switch_ops :: port_rxtstamp() and even later. That mechanism is to simply save the partial timestamp to the skb->cb, and complete it later. Question: why don't we simply populate the skb's struct skb_shared_hwtstamps from ocelot_rcv(), and bother with this complication of propagating the timestamp to felix_rxtstamp()? Answer: dsa_switch_ops :: port_rxtstamp() answers the question whether PTP packets need sleepable context to retrieve the full RX timestamp. Currently felix_rxtstamp() answers "no, thanks" to that question, and calls ocelot_ptp_gettime64() from softirq atomic context. This is understandable, since Felix VSC9959 is a PCIe memory-mapped switch, so hardware access does not require sleeping. But the felix driver is preparing for the introduction of other switches where hardware access is over a slow bus like SPI or MDIO: https://lore.kernel.org/lkml/20210814025003.2449143-1-colin.foster@in-advantage.com/ So I would like to keep this code structure, so the rework needed when that driver will need PTP support will be minimal (answer "yes, I need deferred context for this skb's RX timestamp", then the partial timestamp will still be found in the skb->cb. Fixes: ea440cd2d9b2 ("net: dsa: tag_ocelot: use VLAN information from tagging header when available") Reported-by: Po Liu Cc: Yangbo Lu Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/ocelot/felix.c | 9 +++------ include/linux/dsa/ocelot.h | 1 + net/dsa/tag_ocelot.c | 3 +++ 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 83808e7dbdda..327cc4654806 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1370,12 +1370,12 @@ out: static bool felix_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type) { - u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN; + u32 tstamp_lo = OCELOT_SKB_CB(skb)->tstamp_lo; struct skb_shared_hwtstamps *shhwtstamps; struct ocelot *ocelot = ds->priv; - u32 tstamp_lo, tstamp_hi; struct timespec64 ts; - u64 tstamp, val; + u32 tstamp_hi; + u64 tstamp; /* If the "no XTR IRQ" workaround is in use, tell DSA to defer this skb * for RX timestamping. Then free it, and poll for its copy through @@ -1390,9 +1390,6 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port, ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); tstamp = ktime_set(ts.tv_sec, ts.tv_nsec); - ocelot_xfh_get_rew_val(extraction, &val); - tstamp_lo = (u32)val; - tstamp_hi = tstamp >> 32; if ((tstamp & 0xffffffff) < tstamp_lo) tstamp_hi--; diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index d42010cf5468..7ee708ad7df2 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -12,6 +12,7 @@ struct ocelot_skb_cb { struct sk_buff *clone; unsigned int ptp_class; /* valid only for clones */ + u32 tstamp_lo; u8 ptp_cmd; u8 ts_id; }; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index cd60b94fc175..de1c849a0a70 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -101,6 +101,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; + u64 rew_val; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -130,6 +131,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, ocelot_xfh_get_qos_class(extraction, &qos_class); ocelot_xfh_get_tag_type(extraction, &tag_type); ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); + ocelot_xfh_get_rew_val(extraction, &rew_val); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) @@ -143,6 +145,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, dsa_default_offload_fwd_mark(skb); skb->priority = qos_class; + OCELOT_SKB_CB(skb)->tstamp_lo = rew_val; /* Ocelot switches copy frames unmodified to the CPU. However, it is * possible for the user to request a VLAN modification through -- cgit v1.2.3-59-g8ed1b From 3b65abb8d8a650e50ff5448ac38992ef8a74c584 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Thu, 4 Nov 2021 00:17:51 +0200 Subject: tcp: Use BIT() for OPTION_* constants Extending these flags using the existing (1 << x) pattern triggers complaints from checkpatch. Instead of ignoring checkpatch modify the existing values to use BIT(x) style in a separate commit. Signed-off-by: Leonard Crestez Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f7860e283c6..2e6e5a70168e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -408,13 +408,13 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) return tp->snd_una != tp->snd_up; } -#define OPTION_SACK_ADVERTISE (1 << 0) -#define OPTION_TS (1 << 1) -#define OPTION_MD5 (1 << 2) -#define OPTION_WSCALE (1 << 3) -#define OPTION_FAST_OPEN_COOKIE (1 << 8) -#define OPTION_SMC (1 << 9) -#define OPTION_MPTCP (1 << 10) +#define OPTION_SACK_ADVERTISE BIT(0) +#define OPTION_TS BIT(1) +#define OPTION_MD5 BIT(2) +#define OPTION_WSCALE BIT(3) +#define OPTION_FAST_OPEN_COOKIE BIT(8) +#define OPTION_SMC BIT(9) +#define OPTION_MPTCP BIT(10) static void smc_options_write(__be32 *ptr, u16 *options) { -- cgit v1.2.3-59-g8ed1b From d00c8ee31729248ba40b4ab25cd3b3b580c6f87c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Nov 2021 16:49:11 -0700 Subject: net: fix possible NULL deref in sock_reserve_memory Sanity check in sock_reserve_memory() was not enough to prevent malicious user to trigger a NULL deref. In this case, the isse is that sk_prot->memory_allocated is NULL. Use standard sk_has_account() helper to deal with this. BUG: KASAN: null-ptr-deref in instrument_atomic_read_write include/linux/instrumented.h:101 [inline] BUG: KASAN: null-ptr-deref in atomic_long_add_return include/linux/atomic/atomic-instrumented.h:1218 [inline] BUG: KASAN: null-ptr-deref in sk_memory_allocated_add include/net/sock.h:1371 [inline] BUG: KASAN: null-ptr-deref in sock_reserve_memory net/core/sock.c:994 [inline] BUG: KASAN: null-ptr-deref in sock_setsockopt+0x22ab/0x2b30 net/core/sock.c:1443 Write of size 8 at addr 0000000000000000 by task syz-executor.0/11270 CPU: 1 PID: 11270 Comm: syz-executor.0 Not tainted 5.15.0-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 __kasan_report mm/kasan/report.c:446 [inline] kasan_report.cold+0x66/0xdf mm/kasan/report.c:459 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189 instrument_atomic_read_write include/linux/instrumented.h:101 [inline] atomic_long_add_return include/linux/atomic/atomic-instrumented.h:1218 [inline] sk_memory_allocated_add include/net/sock.h:1371 [inline] sock_reserve_memory net/core/sock.c:994 [inline] sock_setsockopt+0x22ab/0x2b30 net/core/sock.c:1443 __sys_setsockopt+0x4f8/0x610 net/socket.c:2172 __do_sys_setsockopt net/socket.c:2187 [inline] __se_sys_setsockopt net/socket.c:2184 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2184 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f56076d5ae9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f5604c4b188 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007f56077e8f60 RCX: 00007f56076d5ae9 RDX: 0000000000000049 RSI: 0000000000000001 RDI: 0000000000000003 RBP: 00007f560772ff25 R08: 000000000000fec7 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffb61a100f R14: 
00007f5604c4b300 R15: 0000000000022000 Fixes: 2bb2f5fb21b0 ("net: add new socket option SO_RESERVE_MEM") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 9862eefce21e..8f2b2f2c0e7b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -976,7 +976,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg) + if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) -- cgit v1.2.3-59-g8ed1b From 96d0c9be432dfd4908e96dde7cab860368a348ab Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Wed, 3 Nov 2021 20:16:06 +0800 Subject: devlink: fix flexible_array.cocci warning Fix following coccicheck warning: ./net/core/devlink.c:69:6-10: WARNING use flexible-array member instead Signed-off-by: Guo Zhengkui Link: https://lore.kernel.org/r/20211103121607.27490-1-guozhengkui@vivo.com Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 6b5ee862429e..5ba4f9434acd 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -66,7 +66,7 @@ struct devlink { u8 reload_failed:1; refcount_t refcount; struct completion comp; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; void *devlink_priv(struct devlink *devlink) -- cgit v1.2.3-59-g8ed1b From 1e4b50f06d970d8da3474d2a0354450416710bda Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 3 Nov 2021 20:09:42 +0100 Subject: mctp: handle the struct sockaddr_mctp padding fields In order to have the padding fields actually usable in the future, there have to be checks that user space doesn't supply non-zero garbage there. It is also worth setting these padding fields to zero, unless it is known that they have been already zeroed. 
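On the user-space side, the simplest way to satisfy these checks is to zero the whole sockaddr before filling in the individual fields. A minimal sketch, assuming the MCTP UAPI definitions (<linux/mctp.h>, AF_MCTP) are available on the system; the helper name and the network/EID/type values are illustrative, not taken from this patch:

#include <string.h>
#include <sys/socket.h>
#include <linux/mctp.h>

static ssize_t mctp_send_sketch(int sd, const void *buf, size_t len)
{
	struct sockaddr_mctp addr;

	/* memset() guarantees __smctp_pad0/__smctp_pad1 are zero, so the
	 * new mctp_sockaddr_is_ok() check in sendmsg() passes.
	 */
	memset(&addr, 0, sizeof(addr));
	addr.smctp_family = AF_MCTP;
	addr.smctp_network = 1;		/* example network ID */
	addr.smctp_addr.s_addr = 8;	/* example endpoint ID */
	addr.smctp_type = 1;		/* example message type */
	addr.smctp_tag = MCTP_TAG_OWNER;

	return sendto(sd, buf, len, 0,
		      (struct sockaddr *)&addr, sizeof(addr));
}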
Cc: stable@vger.kernel.org # v5.15 Fixes: 5a20dd46b8b84593 ("mctp: Be explicit about struct sockaddr_mctp padding") Signed-off-by: Eugene Syromiatnikov Acked-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index d344b02a1cde..bc88159f8844 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -33,6 +33,12 @@ static int mctp_release(struct socket *sock) return 0; } +/* Generic sockaddr checks, padding checks only so far */ +static bool mctp_sockaddr_is_ok(const struct sockaddr_mctp *addr) +{ + return !addr->__smctp_pad0 && !addr->__smctp_pad1; +} + static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; @@ -52,6 +58,9 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) /* it's a valid sockaddr for MCTP, cast and do protocol checks */ smctp = (struct sockaddr_mctp *)addr; + if (!mctp_sockaddr_is_ok(smctp)) + return -EINVAL; + lock_sock(sk); /* TODO: allow rebind */ @@ -87,6 +96,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) return -EINVAL; if (addr->smctp_family != AF_MCTP) return -EINVAL; + if (!mctp_sockaddr_is_ok(addr)) + return -EINVAL; if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) return -EINVAL; @@ -198,11 +209,13 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, addr = msg->msg_name; addr->smctp_family = AF_MCTP; + addr->__smctp_pad0 = 0; addr->smctp_network = cb->net; addr->smctp_addr.s_addr = hdr->src; addr->smctp_type = type; addr->smctp_tag = hdr->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + addr->__smctp_pad1 = 0; msg->msg_namelen = sizeof(*addr); if (msk->addr_ext) { -- cgit v1.2.3-59-g8ed1b From e9ea574ec1c27e555e7f78cbbcd28af91889d529 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 3 Nov 2021 20:09:46 +0100 Subject: mctp: handle the struct sockaddr_mctp_ext padding field struct sockaddr_mctp_ext.__smctp_paddin0 has to be checked for being set to zero, otherwise it cannot be utilised in the future. 
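The receive side is similar; a rough sketch of reading the extended address back (illustrative only: it assumes extended addressing has already been enabled on the socket, which is what the msk->addr_ext condition in the hunk below refers to, and the function name is made up):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/mctp.h>

static void mctp_recv_ext_sketch(int sd)
{
	struct sockaddr_mctp_ext ext;
	socklen_t addrlen = sizeof(ext);
	unsigned char buf[1024];
	ssize_t n;

	n = recvfrom(sd, buf, sizeof(buf), 0,
		     (struct sockaddr *)&ext, &addrlen);
	if (n < 0 || addrlen < sizeof(ext))
		return;

	/* Only the first smctp_halen bytes of smctp_haddr are valid; with
	 * this fix the reserved __smctp_pad0[] bytes come back zeroed.
	 */
	printf("ifindex %d, lladdr length %d\n",
	       ext.smctp_ifindex, ext.smctp_halen);
}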
Fixes: 99ce45d5e7dbde39 ("mctp: Implement extended addressing") Signed-off-by: Eugene Syromiatnikov Acked-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index bc88159f8844..871cf6266125 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -39,6 +39,13 @@ static bool mctp_sockaddr_is_ok(const struct sockaddr_mctp *addr) return !addr->__smctp_pad0 && !addr->__smctp_pad1; } +static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr) +{ + return !addr->__smctp_pad0[0] && + !addr->__smctp_pad0[1] && + !addr->__smctp_pad0[2]; +} + static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; @@ -135,7 +142,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, extaddr, msg->msg_name); - if (extaddr->smctp_halen > sizeof(cb->haddr)) { + if (!mctp_sockaddr_ext_is_ok(extaddr) || + extaddr->smctp_halen > sizeof(cb->haddr)) { rc = -EINVAL; goto err_free; } @@ -224,6 +232,7 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*ae); ae->smctp_ifindex = cb->ifindex; ae->smctp_halen = cb->halen; + memset(ae->__smctp_pad0, 0x0, sizeof(ae->__smctp_pad0)); memset(ae->smctp_haddr, 0x0, sizeof(ae->smctp_haddr)); memcpy(ae->smctp_haddr, cb->haddr, cb->halen); } -- cgit v1.2.3-59-g8ed1b From af1877b6cad16bdd8d8d93ca1c7b37e8f21ef4e3 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 3 Nov 2021 20:48:37 +0800 Subject: net/smc: Print function name in smcr_link_down tracepoint This makes the output of smcr_link_down tracepoint easier to use and understand without additional translating function's pointer address. It prints the function name with offset: -0 [000] ..s. 69.087164: smcr_link_down: lnk=00000000dab41cdc lgr=000000007d5d8e24 state=0 rc=1 dev=mlx5_0 location=smc_wr_tx_tasklet_fn+0x5ef/0x6f0 [smc] Link: https://lore.kernel.org/netdev/11f17a34-fd35-f2ec-3f20-dd0c34e55fde@linux.ibm.com/ Signed-off-by: Tony Lu Reviewed-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/smc_tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index b4c36795a928..ec17f29646f5 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -99,7 +99,7 @@ TRACE_EVENT(smcr_link_down, __entry->location = location; ), - TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p", + TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%pS", __entry->lnk, __entry->lgr, __entry->state, __get_str(name), __entry->location) -- cgit v1.2.3-59-g8ed1b From 70bf363d7adb3a428773bc905011d0ff923ba747 Mon Sep 17 00:00:00 2001 From: Nghia Le Date: Thu, 4 Nov 2021 21:37:40 +0700 Subject: ipv6: remove useless assignment to newinet in tcp_v6_syn_recv_sock() The newinet value is initialized with inet_sk() in a block code to handle sockets for the ETH_P_IP protocol. Along this code path, newinet is never read. Thus, assignment to newinet is needless and can be removed. 
Signed-off-by: Nghia Le Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211104143740.32446-1-nghialm78@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2cc9b0e53ad1..551fce49841d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1263,7 +1263,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); -- cgit v1.2.3-59-g8ed1b From c0f49d98006f2db3333b917caac65bce2af9865c Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:25 +0800 Subject: can: j1939: j1939_tp_cmd_recv(): ignore abort message in the BAM transport This patch prevents BAM transport from being closed by receiving abort message, as specified in SAE-J1939-82 2015 (A.3.3 Row 4). Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-2-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 6c0a0ebdd024..05eb3d059e17 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2085,6 +2085,12 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) break; case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + if (j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } + if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_abort(priv, skb, true); -- cgit v1.2.3-59-g8ed1b From a79305e156db3d24fcd8eb649cdb3c3b2350e5c2 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:26 +0800 Subject: can: j1939: j1939_can_recv(): ignore messages with invalid source address According to SAE-J1939-82 2015 (A.3.6 Row 2), a receiver should never send TP.CM_CTS to the global address, so we can add a check in j1939_can_recv() to drop messages with invalid source address. 
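For readers less familiar with J1939 addressing, the sketch below shows where that source address comes from: it is the lowest byte of the 29-bit CAN ID, with the 18-bit PGN in the bytes above it, matching the shifts visible in the j1939_can_recv() context of the hunk below. This is an illustration of the standard ID layout only, not code from this patch; 0xFF is the broadcast ("global") address and is never a legal source.

#include <stdbool.h>
#include <stdint.h>

struct j1939_id_sketch {
	uint32_t pgn;
	uint8_t sa;	/* source address: lowest byte of the CAN ID */
	uint8_t da;	/* destination address, PDU1 format only */
};

static bool j1939_sa_is_valid_sketch(uint8_t sa)
{
	return sa != 0xff;	/* 0xFF = broadcast/global, invalid as a source */
}

static struct j1939_id_sketch j1939_decode_sketch(uint32_t can_id)
{
	struct j1939_id_sketch id = {
		.pgn = (can_id >> 8) & 0x3ffff,	/* 18-bit PGN field */
		.sa = can_id & 0xff,
		.da = 0xff,			/* default: broadcast */
	};

	/* PDU1 format (PF byte below 0xF0): the low PGN byte is really a
	 * destination address and is masked out of the PGN itself.
	 */
	if (((id.pgn >> 8) & 0xff) < 0xf0) {
		id.da = id.pgn & 0xff;
		id.pgn &= 0x3ff00;
	}
	return id;
}

Decoding the 18EC0102 frame quoted in the next patch this way gives PGN 0xEC00 (TP.CM) sent from source 0x02 to destination 0x01, which is exactly the kind of directed transport-protocol traffic these checks reason about.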
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-3-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 9bc55ecb37f9..8452b0fbb78c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -75,6 +75,13 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; + + if (!j1939_address_is_valid(skcb->addr.sa)) { + netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", + __func__); + goto done; + } + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; -- cgit v1.2.3-59-g8ed1b From 164051a6ab5445bd97f719f50b16db8b32174269 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:27 +0800 Subject: can: j1939: j1939_tp_cmd_recv(): check the dst address of TP.CM_BAM The TP.CM_BAM message must be sent to the global address [1], so add a check to drop TP.CM_BAM sent to a non-global address. Without this patch, the receiver will treat the following packets as normal RTS/CTS transport: 18EC0102#20090002FF002301 18EB0102#0100000000000000 18EB0102#020000FFFFFFFFFF [1] SAE-J1939-82 2015 A.3.3 Row 1. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-4-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 05eb3d059e17..a271688780a2 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2023,6 +2023,11 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: + if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } fallthrough; case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) -- cgit v1.2.3-59-g8ed1b From e7ea51cd879c8214a824717d28a169b5f2262c02 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 5 Nov 2021 20:30:27 +0300 Subject: sctp: remove unreachable code from sctp_sf_violation_chunk() sctp_sf_violation_chunk() is not called with asoc argument equal to NULL, but if that happens it would lead to NULL pointer dereference in sctp_vtag_verify(). The patch removes code that handles NULL asoc in sctp_sf_violation_chunk(). Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Alexey Khoroshilov Proposed-by: Xin Long Signed-off-by: David S. 
Miller --- net/sctp/sm_statefuns.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5fabaa54b77d..39ba82ee87ce 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4898,9 +4898,6 @@ static enum sctp_disposition sctp_sf_violation_chunk( { static const char err_str[] = "The following chunk violates protocol:"; - if (!asoc) - return sctp_sf_violation(net, ep, asoc, type, arg, commands); - return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, sizeof(err_str)); } -- cgit v1.2.3-59-g8ed1b From 40a34121ac1dc52ed9cd34a8f4e48e32517a52fd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:32 -0700 Subject: bpf, sockmap: Use stricter sk state checks in sk_lookup_assign In order to fix an issue with sockets in TCP sockmap redirect cases we plan to allow CLOSE state sockets to exist in the sockmap. However, the check in bpf_sk_lookup_assign() currently only invalidates sockets in the TCP_ESTABLISHED case relying on the checks on sockmap insert to ensure we never SOCK_CLOSE state sockets in the map. To prepare for this change we flip the logic in bpf_sk_lookup_assign() to explicitly test for the accepted cases. Namely, a tcp socket in TCP_LISTEN or a udp socket in TCP_CLOSE state. This also makes the code more resilent to future changes. Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-2-john.fastabend@gmail.com --- include/linux/skmsg.h | 12 ++++++++++++ net/core/filter.c | 6 ++++-- net/core/sock_map.c | 6 ------ 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index b4256847c707..584d94be9c8b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -507,6 +507,18 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +static inline bool sk_is_tcp(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) diff --git a/net/core/filter.c b/net/core/filter.c index 8e8d3b49c297..a68418268e92 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10423,8 +10423,10 @@ BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ - if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) - return -ESOCKTNOSUPPORT; /* reject connected sockets */ + if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) + return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ + if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) + return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e252b8ec2b85..f39ef79ced67 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -511,12 +511,6 @@ static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } -static bool sk_is_tcp(const struct sock *sk) -{ - return sk->sk_type == 
SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) -- cgit v1.2.3-59-g8ed1b From b8b8315e39ffaca82e79d86dde26e9144addf66b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:33 -0700 Subject: bpf, sockmap: Remove unhash handler for BPF sockmap usage We do not need to handle unhash from BPF side we can simply wait for the close to happen. The original concern was a socket could transition from ESTABLISHED state to a new state while the BPF hook was still attached. But, we convinced ourself this is no longer possible and we also improved BPF sockmap to handle listen sockets so this is no longer a problem. More importantly though there are cases where unhash is called when data is in the receive queue. The BPF unhash logic will flush this data which is wrong. To be correct it should keep the data in the receive queue and allow a receiving application to continue reading the data. This may happen when tcp_abort() is received for example. Instead of complicating the logic in unhash simply moving all this to tcp_close() hook solves this. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-3-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5f4d6f45d87f..246f725b78c9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -475,7 +475,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; -- cgit v1.2.3-59-g8ed1b From c5d2177a72a1659554922728fc407f59950aa929 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:34 -0700 Subject: bpf, sockmap: Fix race in ingress receive verdict with redirect to self A socket in a sockmap may have different combinations of programs attached depending on configuration. There can be no programs in which case the socket acts as a sink only. There can be a TX program in this case a BPF program is attached to sending side, but no RX program is attached. There can be an RX program only where sends have no BPF program attached, but receives are hooked with BPF. And finally, both TX and RX programs may be attached. Giving us the permutations: None, Tx, Rx, and TxRx To date most of our use cases have been TX case being used as a fast datapath to directly copy between local application and a userspace proxy. Or Rx cases and TxRX applications that are operating an in kernel based proxy. The traffic in the first case where we hook applications into a userspace application looks like this: AppA redirect AppB Tx <-----------> Rx | | + + TCP <--> lo <--> TCP In this case all traffic from AppA (after 3whs) is copied into the AppB ingress queue and no traffic is ever on the TCP recieive_queue. In the second case the application never receives, except in some rare error cases, traffic on the actual user space socket. Instead the send happens in the kernel. 
AppProxy socket pool sk0 ------------->{sk1,sk2, skn} ^ | | | | v ingress lb egress TCP TCP Here because traffic is never read off the socket with userspace recv() APIs there is only ever one reader on the sk receive_queue. Namely the BPF programs. However, we've started to introduce a third configuration where the BPF program on receive should process the data, but then the normal case is to push the data into the receive queue of AppB. AppB recv() (userspace) ----------------------- tcp_bpf_recvmsg() (kernel) | | | | | | ingress_msgQ | | | RX_BPF | | | v v sk->receive_queue This is different from the App{A,B} redirect because traffic is first received on the sk->receive_queue. Now for the issue. The tcp_bpf_recvmsg() handler first checks the ingress_msg queue for any data handled by the BPF rx program and returned with PASS code so that it was enqueued on the ingress msg queue. Then if no data exists on that queue it checks the socket receive queue. Unfortunately, this is the same receive_queue the BPF program is reading data off of. So we get a race. Its possible for the recvmsg() hook to pull data off the receive_queue before the BPF hook has a chance to read it. It typically happens when an application is banging on recv() and getting EAGAINs. Until they manage to race with the RX BPF program. To fix this we note that before this patch at attach time when the socket is loaded into the map we check if it needs a TX program or just the base set of proto bpf hooks. Then it uses the above general RX hook regardless of if we have a BPF program attached at rx or not. This patch now extends this check to handle all cases enumerated above, TX, RX, TXRX, and none. And to fix above race when an RX program is attached we use a new hook that is nearly identical to the old one except now we do not let the recv() call skip the RX BPF program. Now only the BPF program pulls data from sk->receive_queue and recv() only pulls data from the ingress msgQ post BPF program handling. With this resolved our AppB from above has been up and running for many hours without detecting any errors. We do this by correlating counters in RX BPF events and the AppB to ensure data is never skipping the BPF program. Selftests, was not able to detect this because we only run them for a short period of time on well ordered send/recvs so we don't get any of the noise we see in real application environments. 
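For reference, the RX-only configuration described above is roughly what the following user-space setup produces. This is a sketch only: it assumes libbpf with a loaded skeleton named skel, a SOCKMAP named sock_map, an sk_skb verdict program named prog_verdict and a connected TCP socket client_fd; none of those names come from this patch.

	__u32 key = 0;
	__u64 value = client_fd;	/* connected TCP socket, map value is its fd */
	int map_fd = bpf_map__fd(skel->maps.sock_map);
	int prog_fd = bpf_program__fd(skel->progs.prog_verdict);
	int err;

	/* Attach a verdict program on RX but no msg_parser program on TX, so
	 * tcp_bpf_update_proto() in the hunk below picks TCP_BPF_RX and
	 * installs tcp_bpf_recvmsg_parser() as the socket's recvmsg handler.
	 */
	err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
	if (!err)
		err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);

With that in place the application's recv() only ever sees data that the verdict program explicitly queued to the ingress message queue, which is the invariant this patch restores.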
Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-4-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 246f725b78c9..f70aa0932bd6 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -172,6 +172,41 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct sk_psock *psock; + int copied; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + long timeo; + int data; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_msg_wait_data(sk, psock, timeo); + if (data && !sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + copied = -EAGAIN; + } + release_sock(sk); + sk_psock_put(sk, psock); + return copied; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -464,6 +499,8 @@ enum { enum { TCP_BPF_BASE, TCP_BPF_TX, + TCP_BPF_RX, + TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; @@ -482,6 +519,12 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; + + prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; + + prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; + prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -519,6 +562,10 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (psock->progs.stream_verdict || psock->progs.skb_verdict) { + config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; + } + if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, -- cgit v1.2.3-59-g8ed1b From e0dc3b93bd7bcff8c3813d1df43e0908499c7cf0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:35 -0700 Subject: bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding Strparser is reusing the qdisc_skb_cb struct to stash the skb message handling progress, e.g. offset and length of the skb. First this is poorly named and inherits a struct from qdisc that doesn't reflect the actual usage of cb[] at this layer. But, more importantly strparser is using the following to access its metadata. (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)) Where _strp_msg is defined as: struct _strp_msg { struct strp_msg strp; /* 0 8 */ int accum_len; /* 8 4 */ /* size: 12, cachelines: 1, members: 2 */ /* last cacheline: 12 bytes */ }; So we use 12 bytes of ->data[] in struct. However in BPF code running parser and verdict the user has read capabilities into the data[] array as well. 
Its not too problematic, but we should not be exposing internal state to BPF program. If its really needed then we can use the probe_read() APIs which allow reading kernel memory. And I don't believe cb[] layer poses any API breakage by moving this around because programs can't depend on cb[] across layers. In order to fix another issue with a ctx rewrite we need to stash a temp variable somewhere. To make this work cleanly this patch builds a cb struct for sk_skb types called sk_skb_cb struct. Then we can use this consistently in the strparser, sockmap space. Additionally we can start allowing ->cb[] write access after this. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-5-john.fastabend@gmail.com --- include/net/strparser.h | 16 +++++++++++++++- net/core/filter.c | 22 ++++++++++++++++++++++ net/strparser/strparser.c | 10 +--------- 3 files changed, 38 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/strparser.h b/include/net/strparser.h index 1d20b98493a1..bec1439bd3be 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -54,10 +54,24 @@ struct strp_msg { int offset; }; +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing + * to upper layer. + */ + struct strp_msg strp; + int accum_len; +}; + +struct sk_skb_cb { +#define SK_SKB_CB_PRIV_LEN 20 + unsigned char data[SK_SKB_CB_PRIV_LEN]; + struct _strp_msg strp; +}; + static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ diff --git a/net/core/filter.c b/net/core/filter.c index a68418268e92..c3936d0724b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9782,11 +9782,33 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 9c0343568d2a..1a72c67afed5 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -27,18 +27,10 @@ static struct workqueue_struct *strp_wq; -struct _strp_msg { - /* Internal cb structure. struct strp_msg must be first for passing - * to upper layer. 
- */ - struct strp_msg strp; - int accum_len; -}; - static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ -- cgit v1.2.3-59-g8ed1b From b2c4618162ec615a15883a804cce7e27afecfa58 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Wed, 3 Nov 2021 13:47:36 -0700 Subject: bpf, sockmap: sk_skb data_end access incorrect when src_reg = dst_reg The current conversion of skb->data_end reads like this: ; data_end = (void*)(long)skb->data_end; 559: (79) r1 = *(u64 *)(r2 +200) ; r1 = skb->data 560: (61) r11 = *(u32 *)(r2 +112) ; r11 = skb->len 561: (0f) r1 += r11 562: (61) r11 = *(u32 *)(r2 +116) 563: (1f) r1 -= r11 But similar to the case in 84f44df664e9 ("bpf: sock_ops sk access may stomp registers when dst_reg = src_reg"), the code will read an incorrect skb->len when src == dst. In this case we end up generating this xlated code: ; data_end = (void*)(long)skb->data_end; 559: (79) r1 = *(u64 *)(r1 +200) ; r1 = skb->data 560: (61) r11 = *(u32 *)(r1 +112) ; r11 = (skb->data)->len 561: (0f) r1 += r11 562: (61) r11 = *(u32 *)(r1 +116) 563: (1f) r1 -= r11 ... where line 560 is the reading 4B of (skb->data + 112) instead of the intended skb->len Here the skb pointer in r1 gets set to skb->data and the later deref for skb->len ends up following skb->data instead of skb. This fixes the issue similarly to the patch mentioned above by creating an additional temporary variable and using to store the register when dst_reg = src_reg. We name the variable bpf_temp_reg and place it in the cb context for sk_skb. Then we restore from the temp to ensure nothing is lost. Fixes: 16137b09a66f2 ("bpf: Compute data_end dynamically with JIT code") Signed-off-by: Jussi Maki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-6-john.fastabend@gmail.com --- include/net/strparser.h | 4 ++++ net/core/filter.c | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/strparser.h b/include/net/strparser.h index bec1439bd3be..732b7097d78e 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -66,6 +66,10 @@ struct sk_skb_cb { #define SK_SKB_CB_PRIV_LEN 20 unsigned char data[SK_SKB_CB_PRIV_LEN]; struct _strp_msg strp; + /* temp_reg is a temporary register used for bpf_convert_data_end_access + * when dst_reg == src_reg. + */ + u64 temp_reg; }; static inline struct strp_msg *strp_msg(struct sk_buff *skb) diff --git a/net/core/filter.c b/net/core/filter.c index c3936d0724b8..e471c9b09670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9756,22 +9756,46 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { - /* si->dst_reg = skb->data */ + int reg; + int temp_reg_off = offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, temp_reg); + + if (si->src_reg == si->dst_reg) { + /* We need an extra register, choose and save a register. 
*/ + reg = BPF_REG_9; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); + } else { + reg = si->dst_reg; + } + + /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - si->dst_reg, si->src_reg, + reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); - /* si->dst_reg = skb->data + skb->len */ - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); - /* si->dst_reg = skb->data + skb->len - skb->data_len */ - *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + /* reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); + + if (si->src_reg == si->dst_reg) { + /* Restore the saved register */ + *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, reg); + *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); + } return insn; } -- cgit v1.2.3-59-g8ed1b From 6dc25401cba4d428328eade8ceae717633fdd702 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Nov 2021 10:08:15 -0800 Subject: net/sched: sch_taprio: fix undefined behavior in ktime_mono_to_any 1) if q->tk_offset == TK_OFFS_MAX, then get_tcp_tstamp() calls ktime_mono_to_any() with out-of-bound value. 2) if q->tk_offset is changed in taprio_parse_clockid(), taprio_get_time() might also call ktime_mono_to_any() with out-of-bound value as sysbot found: UBSAN: array-index-out-of-bounds in kernel/time/timekeeping.c:908:27 index 3 is out of range for type 'ktime_t *[3]' CPU: 1 PID: 25668 Comm: kworker/u4:0 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: bat_events batadv_iv_send_outstanding_bat_ogm_packet Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_out_of_bounds.cold+0x62/0x6c lib/ubsan.c:291 ktime_mono_to_any+0x1d4/0x1e0 kernel/time/timekeeping.c:908 get_tcp_tstamp net/sched/sch_taprio.c:322 [inline] get_packet_txtime net/sched/sch_taprio.c:353 [inline] taprio_enqueue_one+0x5b0/0x1460 net/sched/sch_taprio.c:420 taprio_enqueue+0x3b1/0x730 net/sched/sch_taprio.c:485 dev_qdisc_enqueue+0x40/0x300 net/core/dev.c:3785 __dev_xmit_skb net/core/dev.c:3869 [inline] __dev_queue_xmit+0x1f6e/0x3630 net/core/dev.c:4194 batadv_send_skb_packet+0x4a9/0x5f0 net/batman-adv/send.c:108 batadv_iv_ogm_send_to_if net/batman-adv/bat_iv_ogm.c:393 [inline] batadv_iv_ogm_emit net/batman-adv/bat_iv_ogm.c:421 [inline] batadv_iv_send_outstanding_bat_ogm_packet+0x6d7/0x8e0 net/batman-adv/bat_iv_ogm.c:1701 process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298 worker_thread+0x658/0x11f0 kernel/workqueue.c:2445 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7ede7b03484b ("taprio: make clock reference conversions easier") Fixes: 54002066100b ("taprio: Adjust timestamps for TCP packets") Signed-off-by: Eric Dumazet Cc: Vedang Patel Reported-by: syzbot Reviewed-by: Vinicius Costa Gomes Link: 
https://lore.kernel.org/r/20211108180815.1822479-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9ab068fa2672..377f896bdedc 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -95,18 +95,22 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } -static ktime_t taprio_get_time(struct taprio_sched *q) +static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { - ktime_t mono = ktime_get(); + /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ + enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); - switch (q->tk_offset) { + switch (tk_offset) { case TK_OFFS_MAX: return mono; default: - return ktime_mono_to_any(mono, q->tk_offset); + return ktime_mono_to_any(mono, tk_offset); } +} - return KTIME_MAX; +static ktime_t taprio_get_time(const struct taprio_sched *q) +{ + return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) @@ -319,7 +323,7 @@ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) return 0; } - return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); + return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from @@ -1352,6 +1356,7 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. @@ -1366,22 +1371,24 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, switch (clockid) { case CLOCK_REALTIME: - q->tk_offset = TK_OFFS_REAL; + tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->tk_offset = TK_OFFS_MAX; + tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->tk_offset = TK_OFFS_BOOT; + tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->tk_offset = TK_OFFS_TAI; + tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } + /* This pairs with READ_ONCE() in taprio_mono_to_any */ + WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { -- cgit v1.2.3-59-g8ed1b From c7cd82b90599fa10915f41e3dd9098a77d0aa7b6 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Tue, 9 Nov 2021 00:15:02 +0000 Subject: vsock: prevent unnecessary refcnt inc for nonblocking connect Currently vosck_connect() increments sock refcount for nonblocking socket each time it's called, which can lead to memory leak if it's called multiple times because connect timeout function decrements sock refcount only once. Fixes it by making vsock_connect() return -EALREADY immediately when sock state is already SS_CONNECTING. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reviewed-by: Stefano Garzarella Signed-off-by: Eiichi Tsukata Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7d851eb3a683..ed0df839c38c 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1322,6 +1322,8 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * non-blocking call. 
*/ err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || -- cgit v1.2.3-59-g8ed1b From e5d5aadcf3cd59949316df49c27cb21788d7efe4 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Wed, 10 Nov 2021 15:02:34 +0800 Subject: net/smc: fix sk_refcnt underflow on linkdown and fallback We got the following WARNING when running ab/nginx test with RDMA link flapping (up-down-up). The reason is when smc_sock fallback and at linkdown happens simultaneously, we may got the following situation: __smc_lgr_terminate() --> smc_conn_kill() --> smc_close_active_abort() smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) smc_sock was set to SMC_CLOSED and sock_put() been called when terminate the link group. But later application call close() on the socket, then we got: __smc_release(): if (smc_sock->fallback) smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) Again we set the smc_sock to CLOSED through it's already in CLOSED state, and double put the refcnt, so the following warning happens: refcount_t: underflow; use-after-free. WARNING: CPU: 5 PID: 860 at lib/refcount.c:28 refcount_warn_saturate+0x8d/0xf0 Modules linked in: CPU: 5 PID: 860 Comm: nginx Not tainted 5.10.46+ #403 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/01/2014 RIP: 0010:refcount_warn_saturate+0x8d/0xf0 Code: 05 5c 1e b5 01 01 e8 52 25 bc ff 0f 0b c3 80 3d 4f 1e b5 01 00 75 ad 48 RSP: 0018:ffffc90000527e50 EFLAGS: 00010286 RAX: 0000000000000026 RBX: ffff8881300df2c0 RCX: 0000000000000027 RDX: 0000000000000000 RSI: ffff88813bd58040 RDI: ffff88813bd58048 RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000001 R10: ffff8881300df2c0 R11: ffffc90000527c78 R12: ffff8881300df340 R13: ffff8881300df930 R14: ffff88810b3dad80 R15: ffff8881300df4f8 FS: 00007f739de8fb80(0000) GS:ffff88813bd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000a01b008 CR3: 0000000111b64003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: smc_release+0x353/0x3f0 __sock_release+0x3d/0xb0 sock_close+0x11/0x20 __fput+0x93/0x230 task_work_run+0x65/0xa0 exit_to_user_mode_prepare+0xf9/0x100 syscall_exit_to_user_mode+0x27/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch adds check in __smc_release() to make sure we won't do an extra sock_put() and set the socket to CLOSED when its already in CLOSED state. Fixes: 51f1de79ad8e (net/smc: replace sock_put worker by socket refcounting) Signed-off-by: Dust Li Reviewed-by: Tony Lu Signed-off-by: Dust Li Acked-by: Karsten Graul Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0cf7ed2f5d41..59284da9116d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -149,14 +149,18 @@ static int __smc_release(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { - if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) - sock_put(sk); /* passive closing */ - if (sk->sk_state == SMC_LISTEN) { - /* wake up clcsock accept */ - rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + if (sk->sk_state != SMC_CLOSED) { + if (sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ + if (sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + } + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); } - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); smc_restore_fallback_changes(smc); } -- cgit v1.2.3-59-g8ed1b From 0315a075f1343966ea2d9a085666a88a69ea6a3d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 10 Nov 2021 20:56:05 +0100 Subject: net: fix premature exit from NAPI state polling in napi_disable() Commit 719c57197010 ("net: make napi_disable() symmetric with enable") accidentally introduced a bug sometimes leading to a kernel BUG when bringing an iface up/down under heavy traffic load. Prior to this commit, napi_disable() was polling n->state until none of (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC) is set and then always flip them. Now there's a possibility to get away with the NAPIF_STATE_SCHE unset as 'continue' drops us to the cmpxchg() call with an uninitialized variable, rather than straight to another round of the state check. Error path looks like: napi_disable(): unsigned long val, new; /* new is uninitialized */ do { val = READ_ONCE(n->state); /* NAPIF_STATE_NPSVC and/or NAPIF_STATE_SCHED is set */ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { /* true */ usleep_range(20, 200); continue; /* go straight to the condition check */ } new = val | <...> } while (cmpxchg(&n->state, val, new) != val); /* state == val, cmpxchg() writes garbage */ napi_enable(): do { val = READ_ONCE(n->state); BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); /* 50/50 boom */ <...> while the typical BUG splat is like: [ 172.652461] ------------[ cut here ]------------ [ 172.652462] kernel BUG at net/core/dev.c:6937! [ 172.656914] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 172.661966] CPU: 36 PID: 2829 Comm: xdp_redirect_cp Tainted: G I 5.15.0 #42 [ 172.670222] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0014.082620210524 08/26/2021 [ 172.680646] RIP: 0010:napi_enable+0x5a/0xd0 [ 172.684832] Code: 07 49 81 cc 00 01 00 00 4c 89 e2 48 89 d8 80 e6 fb f0 48 0f b1 55 10 48 39 c3 74 10 48 8b 5d 10 f6 c7 04 75 3d f6 c3 01 75 b4 <0f> 0b 5b 5d 41 5c c3 65 ff 05 b8 e5 61 53 48 c7 c6 c0 f3 34 ad 48 [ 172.703578] RSP: 0018:ffffa3c9497477a8 EFLAGS: 00010246 [ 172.708803] RAX: ffffa3c96615a014 RBX: 0000000000000000 RCX: ffff8a4b575301a0 < snip > [ 172.782403] Call Trace: [ 172.784857] [ 172.786963] ice_up_complete+0x6f/0x210 [ice] [ 172.791349] ice_xdp+0x136/0x320 [ice] [ 172.795108] ? ice_change_mtu+0x180/0x180 [ice] [ 172.799648] dev_xdp_install+0x61/0xe0 [ 172.803401] dev_xdp_attach+0x1e0/0x550 [ 172.807240] dev_change_xdp_fd+0x1e6/0x220 [ 172.811338] do_setlink+0xee8/0x1010 [ 172.814917] rtnl_setlink+0xe5/0x170 [ 172.818499] ? 
bpf_lsm_binder_set_context_mgr+0x10/0x10 [ 172.823732] ? security_capable+0x36/0x50 < snip > Fix this by replacing 'do { } while (cmpxchg())' with an "infinite" for-loop with an explicit break. From v1 [0]: - just use a for-loop to simplify both the fix and the existing code (Eric). [0] https://lore.kernel.org/netdev/20211110191126.1214-1-alexandr.lobakin@intel.com Fixes: 719c57197010 ("net: make napi_disable() symmetric with enable") Suggested-by: Eric Dumazet # for-loop Signed-off-by: Alexander Lobakin Reviewed-by: Jesse Brandeburg Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211110195605.1304-1-alexandr.lobakin@intel.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index edeb811c454e..15ac064b5562 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6928,7 +6928,7 @@ void napi_disable(struct napi_struct *n) might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - do { + for ( ; ; ) { val = READ_ONCE(n->state); if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); @@ -6937,7 +6937,10 @@ void napi_disable(struct napi_struct *n) new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); - } while (cmpxchg(&n->state, val, new) != val); + + if (cmpxchg(&n->state, val, new) == val) + break; + } hrtimer_cancel(&n->timer); -- cgit v1.2.3-59-g8ed1b
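The underlying trap is a plain C control-flow detail: in a do/while loop, continue transfers control to the controlling expression, not to the top of the body, so the cmpxchg() in the condition was evaluated with an uninitialized new. A stand-alone user-space illustration of that semantics (nothing kernel-specific; the variable here is initialized so the demo stays well-defined, unlike new in the original loop):

#include <stdio.h>

static int check(int prepared)
{
	printf("condition evaluated with prepared=%d\n", prepared);
	return prepared < 4;
}

int main(void)
{
	int i = 0, prepared = 0;

	do {
		i++;
		if (i < 3)
			continue;	/* skips the line below... */
		prepared = i;		/* ...the step meant to feed check() */
	} while (check(prepared));	/* ...yet check() still runs, with a stale value */

	printf("left the loop at i=%d\n", i);
	return 0;
}

The first two iterations print "prepared=0" even though the body never reached the prepared = i line, which is exactly the shape of the bug; the fix above avoids it by using an explicit for (;;) loop in which cmpxchg() can only be reached after new has been computed.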