aboutsummaryrefslogtreecommitdiffstats
path: root/include/net/sock.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/net/sock.h')
-rw-r--r--include/net/sock.h796
1 files changed, 572 insertions, 224 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 328564525526..5db02546941c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -56,17 +56,19 @@
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
-#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
-
+#include <linux/sockptr.h>
+#include <linux/indirect_call_wrapper.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/llist.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
#include <net/l3mdev.h>
+#include <uapi/linux/socket.h>
/*
* This structure really needs to be cleaned up.
@@ -159,9 +161,6 @@ typedef __u64 __bitwise __addrpair;
* for struct sock and struct inet_timewait_sock.
*/
struct sock_common {
- /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
- * address on 64bit arches : cf INET_MATCH()
- */
union {
__addrpair skc_addrpair;
struct {
@@ -225,7 +224,7 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
unsigned short skc_tx_queue_mapping;
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
unsigned short skc_rx_queue_mapping;
#endif
union {
@@ -245,7 +244,8 @@ struct sock_common {
/* public: */
};
-struct bpf_sk_storage;
+struct bpf_local_storage;
+struct sk_filter;
/**
* struct sock - network layer representation of sockets
@@ -257,10 +257,11 @@ struct bpf_sk_storage;
* @sk_rcvbuf: size of receive buffer in bytes
* @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux
+ * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
+ * @sk_rx_dst_cookie: cookie for @sk_rx_dst
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
- * @sk_rx_skb_cache: cache copy of recently accessed RX skb
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
@@ -268,6 +269,7 @@ struct bpf_sk_storage;
* @sk_omem_alloc: "o" is "option" or "other"
* @sk_wmem_queued: persistent queue size
* @sk_forward_alloc: space allocated forward
+ * @sk_reserved_mem: space reserved and non-reclaimable for the socket
* @sk_napi_id: id of the last napi context to receive data for sk
* @sk_ll_usec: usecs to busypoll when there is no data
* @sk_allocation: allocation mode
@@ -280,9 +282,7 @@ struct bpf_sk_storage;
* @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
* @sk_no_check_rx: allow zero checksum in RX packets
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
- * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
- * @sk_route_forced_caps: static, forced route capabilities
- * (set in tcp_init_sock())
+ * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden.
* @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
* @sk_gso_max_size: Maximum GSO segment size to build
* @sk_gso_max_segs: Maximum number of GSO segments
@@ -300,20 +300,26 @@ struct bpf_sk_storage;
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
+ * @sk_prefer_busy_poll: prefer busypolling over softirq processing
+ * @sk_busy_poll_budget: napi processing budget when busypolling
* @sk_priority: %SO_PRIORITY setting
* @sk_type: socket type (%SOCK_STREAM, etc)
* @sk_protocol: which protocol this socket belongs in this network family
+ * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
* @sk_peer_pid: &struct pid for this socket's peer
* @sk_peer_cred: %SO_PEERCRED setting
* @sk_rcvlowat: %SO_RCVLOWAT setting
* @sk_rcvtimeo: %SO_RCVTIMEO setting
* @sk_sndtimeo: %SO_SNDTIMEO setting
* @sk_txhash: computed flow hash for use on transmit
+ * @sk_txrehash: enable TX hash rethink
* @sk_filter: socket filtering instructions
* @sk_timer: sock cleanup timer
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
- * @sk_tsflags: SO_TIMESTAMPING socket options
+ * @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
+ * for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests
* @sk_zckey: counter to order MSG_ZEROCOPY notifications
* @sk_socket: Identd and reporting IO signals
@@ -322,7 +328,6 @@ struct bpf_sk_storage;
* @sk_peek_off: current peek_offset value
* @sk_send_head: front of stuff to transmit
* @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
- * @sk_tx_skb_cache: cache copy of recently accessed TX skb
* @sk_security: used by security modules
* @sk_mark: generic packet mark
* @sk_cgrp_data: cgroup data for this cgroup
@@ -342,6 +347,8 @@ struct bpf_sk_storage;
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
+ * @ns_tracker: tracker for netns reference
+ * @sk_bind2_node: bind node in the bhash2 table
*/
struct sock {
/*
@@ -353,7 +360,7 @@ struct sock {
#define sk_nulls_node __sk_common.skc_nulls_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping
#endif
@@ -383,11 +390,15 @@ struct sock {
#define sk_flags __sk_common.skc_flags
#define sk_rxhash __sk_common.skc_rxhash
+ /* early demux fields */
+ struct dst_entry __rcu *sk_rx_dst;
+ int sk_rx_dst_ifindex;
+ u32 sk_rx_dst_cookie;
+
socket_lock_t sk_lock;
atomic_t sk_drops;
int sk_rcvlowat;
struct sk_buff_head sk_error_queue;
- struct sk_buff *sk_rx_skb_cache;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -403,9 +414,11 @@ struct sock {
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
+
#define sk_rmem_alloc sk_backlog.rmem_alloc
int sk_forward_alloc;
+ u32 sk_reserved_mem;
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sk_ll_usec;
/* ===== mostly read cache line ===== */
@@ -423,7 +436,7 @@ struct sock {
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
#endif
- struct dst_entry *sk_rx_dst;
+
struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc;
int sk_sndbuf;
@@ -436,7 +449,6 @@ struct sock {
struct sk_buff *sk_send_head;
struct rb_root tcp_rtx_queue;
};
- struct sk_buff *sk_tx_skb_cache;
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
@@ -450,8 +462,6 @@ struct sock {
unsigned long sk_max_pacing_rate;
struct page_frag sk_frag;
netdev_features_t sk_route_caps;
- netdev_features_t sk_route_nocaps;
- netdev_features_t sk_route_forced_caps;
int sk_gso_type;
unsigned int sk_gso_max_size;
gfp_t sk_allocation;
@@ -461,7 +471,7 @@ struct sock {
* Because of non atomicity rules, all
* changes are protected by socket lock.
*/
- u8 sk_padding : 1,
+ u8 sk_gso_disabled : 1,
sk_kern_sock : 1,
sk_no_check_tx : 1,
sk_no_check_rx : 1,
@@ -478,8 +488,16 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
+ u8 sk_txrehash;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ u8 sk_prefer_busy_poll;
+ u16 sk_busy_poll_budget;
+#endif
+ spinlock_t sk_peer_lock;
+ int sk_bind_phc;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
+
long sk_rcvtimeo;
ktime_t sk_stamp;
#if BITS_PER_LONG==32
@@ -487,7 +505,7 @@ struct sock {
#endif
u16 sk_tsflags;
u8 sk_shutdown;
- u32 sk_tskey;
+ atomic_t sk_tskey;
atomic_t sk_zckey;
u8 sk_clockid;
@@ -516,9 +534,11 @@ struct sock {
void (*sk_destruct)(struct sock *sk);
struct sock_reuseport __rcu *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
- struct bpf_sk_storage __rcu *sk_bpf_storage;
+ struct bpf_local_storage __rcu *sk_bpf_storage;
#endif
struct rcu_head sk_rcu;
+ netns_tracker ns_tracker;
+ struct hlist_node sk_bind2_node;
};
enum sk_pacing {
@@ -527,10 +547,109 @@ enum sk_pacing {
SK_PACING_FQ = 2,
};
+/* flag bits in sk_user_data
+ *
+ * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might
+ * not be suitable for copying when cloning the socket. For instance,
+ * it can point to a reference counted object. sk_user_data bottom
+ * bit is set if pointer must not be copied.
+ *
+ * - SK_USER_DATA_BPF: Mark whether sk_user_data field is
+ * managed/owned by a BPF reuseport array. This bit should be set
+ * when sk_user_data's sk is added to the bpf's reuseport_array.
+ *
+ * - SK_USER_DATA_PSOCK: Mark whether pointer stored in
+ * sk_user_data points to psock type. This bit should be set
+ * when sk_user_data is assigned to a psock object.
+ */
+#define SK_USER_DATA_NOCOPY 1UL
+#define SK_USER_DATA_BPF 2UL
+#define SK_USER_DATA_PSOCK 4UL
+#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+ SK_USER_DATA_PSOCK)
+
+/**
+ * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
+ * @sk: socket
+ */
+static inline bool sk_user_data_is_nocopy(const struct sock *sk)
+{
+ return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY);
+}
+
#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
-#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk)))
-#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr)
+/**
+ * __locked_read_sk_user_data_with_flags - return the pointer
+ * only if argument flags all has been set in sk_user_data. Otherwise
+ * return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ *
+ * The caller must be holding sk->sk_callback_lock.
+ */
+static inline void *
+__locked_read_sk_user_data_with_flags(const struct sock *sk,
+ uintptr_t flags)
+{
+ uintptr_t sk_user_data =
+ (uintptr_t)rcu_dereference_check(__sk_user_data(sk),
+ lockdep_is_held(&sk->sk_callback_lock));
+
+ WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+ if ((sk_user_data & flags) == flags)
+ return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ return NULL;
+}
+
+/**
+ * __rcu_dereference_sk_user_data_with_flags - return the pointer
+ * only if argument flags all has been set in sk_user_data. Otherwise
+ * return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ */
+static inline void *
+__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
+ uintptr_t flags)
+{
+ uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));
+
+ WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+ if ((sk_user_data & flags) == flags)
+ return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ return NULL;
+}
+
+#define rcu_dereference_sk_user_data(sk) \
+ __rcu_dereference_sk_user_data_with_flags(sk, 0)
+#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \
+({ \
+ uintptr_t __tmp1 = (uintptr_t)(ptr), \
+ __tmp2 = (uintptr_t)(flags); \
+ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \
+ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \
+ rcu_assign_pointer(__sk_user_data((sk)), \
+ __tmp1 | __tmp2); \
+})
+#define rcu_assign_sk_user_data(sk, ptr) \
+ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
+
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+ return read_pnet(&sk->sk_net);
+}
+
+static inline
+void sock_net_set(struct sock *sk, struct net *net)
+{
+ write_pnet(&sk->sk_net, net);
+}
/*
* SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
@@ -545,7 +664,7 @@ enum sk_pacing {
int sk_set_peek_off(struct sock *sk, int val);
-static inline int sk_peek_offset(struct sock *sk, int flags)
+static inline int sk_peek_offset(const struct sock *sk, int flags)
{
if (unlikely(flags & MSG_PEEK)) {
return READ_ONCE(sk->sk_peek_off);
@@ -625,11 +744,6 @@ static inline void sk_node_init(struct hlist_node *node)
node->pprev = NULL;
}
-static inline void sk_nulls_node_init(struct hlist_nulls_node *node)
-{
- node->pprev = NULL;
-}
-
static inline void __sk_del_node(struct sock *sk)
{
__hlist_del(&sk->sk_node);
@@ -753,6 +867,16 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_add_head(&sk->sk_bind_node, list);
}
+static inline void __sk_del_bind2_node(struct sock *sk)
+{
+ __hlist_del(&sk->sk_bind2_node);
+}
+
+static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
+{
+ hlist_add_head(&sk->sk_bind2_node, list);
+}
+
#define sk_for_each(__sk, list) \
hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
@@ -770,6 +894,8 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
+#define sk_for_each_bound_bhash2(__sk, list) \
+ hlist_for_each_entry(__sk, list, sk_bind2_node)
/**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
@@ -785,7 +911,7 @@ static inline void sk_add_bind_node(struct sock *sk,
({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \
pos = rcu_dereference(hlist_next_rcu(pos)))
-static inline struct user_namespace *sk_user_ns(struct sock *sk)
+static inline struct user_namespace *sk_user_ns(const struct sock *sk)
{
/* Careful only use this in a context where these parameters
* can not change and must all be valid, such as recvmsg from
@@ -810,7 +936,6 @@ enum sock_flags {
SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
- SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
SOCK_MEMALLOC, /* VM depends on this socket for swapping */
SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */
SOCK_FASYNC, /* fasync() active */
@@ -827,11 +952,12 @@ enum sock_flags {
SOCK_TXTIME,
SOCK_XDP, /* XDP is attached */
SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
+ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
+static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk)
{
nsk->sk_flags = osk->sk_flags;
}
@@ -846,6 +972,15 @@ static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
__clear_bit(flag, &sk->sk_flags);
}
+static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
+ int valbool)
+{
+ if (valbool)
+ sock_set_flag(sk, bit);
+ else
+ sock_reset_flag(sk, bit);
+}
+
static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
{
return test_bit(flag, &sk->sk_flags);
@@ -857,6 +992,8 @@ static inline int sk_memalloc_socks(void)
{
return static_branch_unlikely(&memalloc_socks_key);
}
+
+void __receive_sock(struct file *file);
#else
static inline int sk_memalloc_socks(void)
@@ -864,6 +1001,8 @@ static inline int sk_memalloc_socks(void)
return 0;
}
+static inline void __receive_sock(struct file *file)
+{ }
#endif
static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
@@ -881,6 +1020,10 @@ static inline void sk_acceptq_added(struct sock *sk)
WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}
+/* Note: If you think the test should be:
+ * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
+ * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.")
+ */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
@@ -955,12 +1098,18 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb));
+INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb));
+
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
if (sk_memalloc_socks() && skb_pfmemalloc(skb))
return __sk_backlog_rcv(sk, skb);
- return sk->sk_backlog_rcv(sk, skb);
+ return INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+ tcp_v6_do_rcv,
+ tcp_v4_do_rcv,
+ sk, skb);
}
static inline void sk_incoming_cpu_update(struct sock *sk)
@@ -1061,6 +1210,7 @@ struct inet_hashinfo;
struct raw_hashinfo;
struct smc_hashinfo;
struct module;
+struct sk_psock;
/*
* caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
@@ -1097,36 +1247,31 @@ struct proto {
void (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level,
- int optname, char __user *optval,
+ int optname, sockptr_t optval,
unsigned int optlen);
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
void (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
- int (*compat_setsockopt)(struct sock *sk,
- int level,
- int optname, char __user *optval,
- unsigned int optlen);
- int (*compat_getsockopt)(struct sock *sk,
- int level,
- int optname, char __user *optval,
- int __user *option);
int (*compat_ioctl)(struct sock *sk,
unsigned int cmd, unsigned long arg);
#endif
int (*sendmsg)(struct sock *sk, struct msghdr *msg,
size_t len);
int (*recvmsg)(struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags,
- int *addr_len);
+ size_t len, int flags, int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
- struct sockaddr *uaddr, int addr_len);
+ struct sockaddr *addr, int addr_len);
+ int (*bind_add)(struct sock *sk,
+ struct sockaddr *addr, int addr_len);
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
+ bool (*bpf_bypass_getsockopt)(int level,
+ int optname);
void (*release_cb)(struct sock *sk);
@@ -1135,19 +1280,31 @@ struct proto {
void (*unhash)(struct sock *sk);
void (*rehash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
+ void (*put_port)(struct sock *sk);
+#ifdef CONFIG_BPF_SYSCALL
+ int (*psock_update_sk_prot)(struct sock *sk,
+ struct sk_psock *psock,
+ bool restore);
+#endif
/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
unsigned int inuse_idx;
#endif
+#if IS_ENABLED(CONFIG_MPTCP)
+ int (*forward_alloc_get)(const struct sock *sk);
+#endif
+
bool (*stream_memory_free)(const struct sock *sk, int wake);
- bool (*stream_memory_read)(const struct sock *sk);
+ bool (*sock_is_readable)(struct sock *sk);
/* Memory pressure */
void (*enter_memory_pressure)(struct sock *sk);
void (*leave_memory_pressure)(struct sock *sk);
atomic_long_t *memory_allocated; /* Current allocated memory. */
+ int __percpu *per_cpu_fw_alloc;
struct percpu_counter *sockets_allocated; /* Current number of sockets. */
+
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
@@ -1171,7 +1328,7 @@ struct proto {
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
- struct percpu_counter *orphan_count;
+ unsigned int __percpu *orphan_count;
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
@@ -1223,13 +1380,25 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
#define sk_refcnt_debug_release(sk) do { } while (0)
#endif /* SOCK_REFCNT_DEBUG */
+INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
+
+static inline int sk_forward_alloc_get(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+ if (sk->sk_prot->forward_alloc_get)
+ return sk->sk_prot->forward_alloc_get(sk);
+#endif
+ return sk->sk_forward_alloc;
+}
+
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
return false;
return sk->sk_prot->stream_memory_free ?
- sk->sk_prot->stream_memory_free(sk, wake) : true;
+ INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free,
+ tcp_stream_memory_free, sk, wake) : true;
}
static inline bool sk_stream_memory_free(const struct sock *sk)
@@ -1277,31 +1446,60 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
}
static inline long
-sk_memory_allocated(const struct sock *sk)
+proto_memory_allocated(const struct proto *prot)
{
- return atomic_long_read(sk->sk_prot->memory_allocated);
+ return max(0L, atomic_long_read(prot->memory_allocated));
}
static inline long
+sk_memory_allocated(const struct sock *sk)
+{
+ return proto_memory_allocated(sk->sk_prot);
+}
+
+/* 1 MB per cpu, in page units */
+#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
+
+static inline void
sk_memory_allocated_add(struct sock *sk, int amt)
{
- return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
+ int local_reserve;
+
+ preempt_disable();
+ local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+ if (local_reserve >= SK_MEMORY_PCPU_RESERVE) {
+ __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+ atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+ }
+ preempt_enable();
}
static inline void
sk_memory_allocated_sub(struct sock *sk, int amt)
{
- atomic_long_sub(amt, sk->sk_prot->memory_allocated);
+ int local_reserve;
+
+ preempt_disable();
+ local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+ if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) {
+ __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+ atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+ }
+ preempt_enable();
}
+#define SK_ALLOC_PERCPU_COUNTER_BATCH 16
+
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
- percpu_counter_dec(sk->sk_prot->sockets_allocated);
+ percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1,
+ SK_ALLOC_PERCPU_COUNTER_BATCH);
}
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
- percpu_counter_inc(sk->sk_prot->sockets_allocated);
+ percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1,
+ SK_ALLOC_PERCPU_COUNTER_BATCH);
}
static inline u64
@@ -1316,12 +1514,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
return percpu_counter_sum_positive(prot->sockets_allocated);
}
-static inline long
-proto_memory_allocated(struct proto *prot)
-{
- return atomic_long_read(prot->memory_allocated);
-}
-
static inline bool
proto_memory_pressure(struct proto *prot)
{
@@ -1332,13 +1524,32 @@ proto_memory_pressure(struct proto *prot)
#ifdef CONFIG_PROC_FS
-/* Called with local bh disabled */
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
+#define PROTO_INUSE_NR 64 /* should be enough for the first time */
+struct prot_inuse {
+ int all;
+ int val[PROTO_INUSE_NR];
+};
+
+static inline void sock_prot_inuse_add(const struct net *net,
+ const struct proto *prot, int val)
+{
+ this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
+}
+
+static inline void sock_inuse_add(const struct net *net, int val)
+{
+ this_cpu_add(net->core.prot_inuse->all, val);
+}
+
int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
-static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
- int inc)
+static inline void sock_prot_inuse_add(const struct net *net,
+ const struct proto *prot, int val)
+{
+}
+
+static inline void sock_inuse_add(const struct net *net, int val)
{
}
#endif
@@ -1363,8 +1574,6 @@ static inline int __sk_prot_rehash(struct sock *sk)
#define RCV_SHUTDOWN 1
#define SEND_SHUTDOWN 2
-#define SOCK_SNDBUF_LOCK 1
-#define SOCK_RCVBUF_LOCK 2
#define SOCK_BINDADDR_LOCK 4
#define SOCK_BINDPORT_LOCK 8
@@ -1391,30 +1600,18 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);
-/* We used to have PAGE_SIZE here, but systems with 64KB pages
- * do not necessarily have 16x time more memory than 4KB ones.
- */
-#define SK_MEM_QUANTUM 4096
-#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
#define SK_MEM_SEND 0
#define SK_MEM_RECV 1
-/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+/* sysctl_mem values are in pages */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
- long val = sk->sk_prot->sysctl_mem[index];
-
-#if PAGE_SIZE > SK_MEM_QUANTUM
- val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
-#elif PAGE_SIZE < SK_MEM_QUANTUM
- val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
-#endif
- return val;
+ return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}
static inline int sk_mem_pages(int amt)
{
- return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
+ return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}
static inline bool sk_has_account(struct sock *sk)
@@ -1425,36 +1622,56 @@ static inline bool sk_has_account(struct sock *sk)
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
+ int delta;
+
if (!sk_has_account(sk))
return true;
- return size <= sk->sk_forward_alloc ||
- __sk_mem_schedule(sk, size, SK_MEM_SEND);
+ delta = size - sk->sk_forward_alloc;
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
}
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
+ int delta;
+
if (!sk_has_account(sk))
return true;
- return size<= sk->sk_forward_alloc ||
- __sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+ delta = size - sk->sk_forward_alloc;
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
skb_pfmemalloc(skb);
}
+static inline int sk_unused_reserved_mem(const struct sock *sk)
+{
+ int unused_mem;
+
+ if (likely(!sk->sk_reserved_mem))
+ return 0;
+
+ unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued -
+ atomic_read(&sk->sk_rmem_alloc);
+
+ return unused_mem > 0 ? unused_mem : 0;
+}
+
static inline void sk_mem_reclaim(struct sock *sk)
{
+ int reclaimable;
+
if (!sk_has_account(sk))
return;
- if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
- __sk_mem_reclaim(sk, sk->sk_forward_alloc);
+
+ reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
+
+ if (reclaimable >= (int)PAGE_SIZE)
+ __sk_mem_reclaim(sk, reclaimable);
}
-static inline void sk_mem_reclaim_partial(struct sock *sk)
+static inline void sk_mem_reclaim_final(struct sock *sk)
{
- if (!sk_has_account(sk))
- return;
- if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
- __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1);
+ sk->sk_reserved_mem = 0;
+ sk_mem_reclaim(sk);
}
static inline void sk_mem_charge(struct sock *sk, int size)
@@ -1469,42 +1686,7 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
if (!sk_has_account(sk))
return;
sk->sk_forward_alloc += size;
-
- /* Avoid a possible overflow.
- * TCP send queues can make this happen, if sk_mem_reclaim()
- * is not called and more than 2 GBytes are released at once.
- *
- * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
- * no need to hold that much forward allocation anyway.
- */
- if (unlikely(sk->sk_forward_alloc >= 1 << 21))
- __sk_mem_reclaim(sk, 1 << 20);
-}
-
-DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
-static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
-{
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
- sk_wmem_queued_add(sk, -skb->truesize);
- sk_mem_uncharge(sk, skb->truesize);
- if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
- !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
- skb_ext_reset(skb);
- skb_zcopy_clear(skb, true);
- sk->sk_tx_skb_cache = skb;
- return;
- }
- __kfree_skb(skb);
-}
-
-static inline void sock_release_ownership(struct sock *sk)
-{
- if (sk->sk_lock.owned) {
- sk->sk_lock.owned = 0;
-
- /* The sk_lock has mutex_unlock() semantics: */
- mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
- }
+ sk_mem_reclaim(sk);
}
/*
@@ -1526,13 +1708,11 @@ do { \
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
} while (0)
-#ifdef CONFIG_LOCKDEP
static inline bool lockdep_sock_is_held(const struct sock *sk)
{
return lockdep_is_held(&sk->sk_lock) ||
lockdep_is_held(&sk->sk_lock.slock);
}
-#endif
void lock_sock_nested(struct sock *sk, int subclass);
@@ -1541,6 +1721,7 @@ static inline void lock_sock(struct sock *sk)
lock_sock_nested(sk, 0);
}
+void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);
@@ -1551,7 +1732,37 @@ void release_sock(struct sock *sk);
SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock))
-bool lock_sock_fast(struct sock *sk);
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);
+
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small section, where process wont block
+ * return false if fast path is taken:
+ *
+ * sk_lock.slock locked, owned = 0, BH disabled
+ *
+ * return true if slow path is taken:
+ *
+ * sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+static inline bool lock_sock_fast(struct sock *sk)
+{
+ /* The sk_lock has mutex_lock() semantics here. */
+ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+
+ return __lock_sock_fast(sk);
+}
+
+/* fast socket lock variant for caller already holding a [different] socket lock */
+static inline bool lock_sock_fast_nested(struct sock *sk)
+{
+ mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);
+
+ return __lock_sock_fast(sk);
+}
+
/**
* unlock_sock_fast - complement of lock_sock_fast
* @sk: socket
@@ -1561,13 +1772,22 @@ bool lock_sock_fast(struct sock *sk);
* If slow mode is on, we call regular release_sock()
*/
static inline void unlock_sock_fast(struct sock *sk, bool slow)
+ __releases(&sk->sk_lock.slock)
{
- if (slow)
+ if (slow) {
release_sock(sk);
- else
+ __release(&sk->sk_lock.slock);
+ } else {
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
spin_unlock_bh(&sk->sk_lock.slock);
+ }
}
+void sockopt_lock_sock(struct sock *sk);
+void sockopt_release_sock(struct sock *sk);
+bool sockopt_ns_capable(struct user_namespace *ns, int cap);
+bool sockopt_capable(int cap);
+
/* Used by processes to "lock" a socket state, so that
* interrupts and bottom half handlers won't change it
* from under us. It essentially blocks any incoming
@@ -1600,12 +1820,23 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
return sk->sk_lock.owned;
}
+static inline void sock_release_ownership(struct sock *sk)
+{
+ if (sock_owned_by_user_nocheck(sk)) {
+ sk->sk_lock.owned = 0;
+
+ /* The sk_lock has mutex_unlock() semantics: */
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
+ }
+}
+
/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
struct sock *sk = (struct sock *)csk;
- return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock);
+ return !sock_owned_by_user_nocheck(sk) &&
+ !spin_is_locked(&sk->sk_lock.slock);
}
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
@@ -1626,27 +1857,45 @@ void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
+void sock_pfree(struct sk_buff *skb);
#else
#define sock_edemux sock_efree
#endif
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
- char __user *optval, unsigned int optlen);
+ sockptr_t optval, unsigned int optlen);
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen);
int sock_getsockopt(struct socket *sock, int level, int op,
char __user *optval, int __user *optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
bool timeval, bool time32);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
- int noblock, int *errcode);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
unsigned long data_len, int noblock,
int *errcode, int max_page_order);
+
+static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
+ unsigned long size,
+ int noblock, int *errcode)
+{
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
+}
+
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);
+static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
+{
+ if (sk->sk_socket)
+ clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ WRITE_ONCE(sk->sk_prot, proto);
+}
+
struct sockcm_cookie {
u64 transmit_time;
u32 mark;
@@ -1676,8 +1925,6 @@ int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
-int sock_no_getsockopt(struct socket *, int , int, char __user *, int __user *);
-int sock_no_setsockopt(struct socket *, int, int, char __user *, unsigned int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
@@ -1697,11 +1944,7 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen);
-int compat_sock_common_getsockopt(struct socket *sock, int level,
- int optname, char __user *optval, int __user *optlen);
-int compat_sock_common_setsockopt(struct socket *sock, int level,
- int optname, char __user *optval, unsigned int optlen);
+ sockptr_t optval, unsigned int optlen);
void sk_common_release(struct sock *sk);
@@ -1779,40 +2022,54 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return -1;
}
-static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+static inline void __sk_rx_queue_set(struct sock *sk,
+ const struct sk_buff *skb,
+ bool force_set)
{
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
if (skb_rx_queue_recorded(skb)) {
u16 rx_queue = skb_get_rx_queue(skb);
- if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
- return;
-
- sk->sk_rx_queue_mapping = rx_queue;
+ if (force_set ||
+ unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
+ WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
}
#endif
}
+static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+{
+ __sk_rx_queue_set(sk, skb, true);
+}
+
+static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb)
+{
+ __sk_rx_queue_set(sk, skb, false);
+}
+
static inline void sk_rx_queue_clear(struct sock *sk)
{
-#ifdef CONFIG_XPS
- sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}
-#ifdef CONFIG_XPS
static inline int sk_rx_queue_get(const struct sock *sk)
{
- if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
- return sk->sk_rx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ if (sk) {
+ int res = READ_ONCE(sk->sk_rx_queue_mapping);
+
+ if (res != NO_QUEUE_MAPPING)
+ return res;
+ }
+#endif
return -1;
}
-#endif
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
- sk_tx_queue_clear(sk);
sk->sk_socket = sock;
}
@@ -1859,20 +2116,24 @@ static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
static inline u32 net_tx_rndhash(void)
{
- u32 v = prandom_u32();
+ u32 v = get_random_u32();
return v ?: 1;
}
static inline void sk_set_txhash(struct sock *sk)
{
- sk->sk_txhash = net_tx_rndhash();
+ /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
+ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}
-static inline void sk_rethink_txhash(struct sock *sk)
+static inline bool sk_rethink_txhash(struct sock *sk)
{
- if (sk->sk_txhash)
+ if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) {
sk_set_txhash(sk);
+ return true;
+ }
+ return false;
}
static inline struct dst_entry *
@@ -1895,12 +2156,10 @@ sk_dst_get(struct sock *sk)
return dst;
}
-static inline void dst_negative_advice(struct sock *sk)
+static inline void __dst_negative_advice(struct sock *sk)
{
struct dst_entry *ndst, *dst = __sk_dst_get(sk);
- sk_rethink_txhash(sk);
-
if (dst && dst->ops->negative_advice) {
ndst = dst->ops->negative_advice(dst);
@@ -1912,6 +2171,12 @@ static inline void dst_negative_advice(struct sock *sk)
}
}
+static inline void dst_negative_advice(struct sock *sk)
+{
+ sk_rethink_txhash(sk);
+ __dst_negative_advice(sk);
+}
+
static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
@@ -1962,13 +2227,10 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
if (skb_get_dst_pending_confirm(skb)) {
struct sock *sk = skb->sk;
- unsigned long now = jiffies;
- /* avoid dirtying neighbour */
- if (READ_ONCE(n->confirmed) != now)
- WRITE_ONCE(n->confirmed, now);
if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
+ neigh_confirm(n);
}
}
@@ -1981,10 +2243,10 @@ static inline bool sk_can_gso(const struct sock *sk)
void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
-static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
+static inline void sk_gso_disable(struct sock *sk)
{
- sk->sk_route_nocaps |= flags;
- sk->sk_route_caps &= ~flags;
+ sk->sk_gso_disabled = 1;
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
}
static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
@@ -2030,9 +2292,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
if (err)
return err;
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
+ skb_len_add(skb, copy);
sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
return 0;
@@ -2131,9 +2391,12 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock,
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
- if (sk->sk_txhash) {
+ /* This pairs with WRITE_ONCE() in sk_set_txhash() */
+ u32 txhash = READ_ONCE(sk->sk_txhash);
+
+ if (txhash) {
skb->l4_hash = 1;
- skb->hash = sk->sk_txhash;
+ skb->hash = txhash;
}
}
@@ -2156,17 +2419,46 @@ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
sk_mem_charge(sk, skb->truesize);
}
+static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
+{
+ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb_orphan(skb);
+ skb->destructor = sock_efree;
+ skb->sk = sk;
+ return true;
+ }
+ return false;
+}
+
+static inline void skb_prepare_for_gro(struct sk_buff *skb)
+{
+ if (skb->destructor != sock_wfree) {
+ skb_orphan(skb);
+ return;
+ }
+ skb->slow_gro = 1;
+}
+
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
unsigned long expires);
void sk_stop_timer(struct sock *sk, struct timer_list *timer);
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);
+
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb, unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason);
+
+static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ return sock_queue_rcv_skb_reason(sk, skb, NULL);
+}
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);
@@ -2178,12 +2470,19 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk);
static inline int sock_error(struct sock *sk)
{
int err;
- if (likely(!sk->sk_err))
+
+ /* Avoid an atomic operation for the common case.
+ * This is racy since another cpu/thread can change sk_err under us.
+ */
+ if (likely(data_race(!sk->sk_err)))
return 0;
+
err = xchg(&sk->sk_err, 0);
return -err;
}
+void sk_error_report(struct sock *sk);
+
static inline unsigned long sock_wspace(struct sock *sk)
{
int amt = 0;
@@ -2245,31 +2544,32 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
return;
val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
+ val = max_t(u32, val, sk_unused_reserved_mem(sk));
WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
}
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
- bool force_schedule);
-
/**
* sk_page_frag - return an appropriate page_frag
* @sk: socket
*
* Use the per task page_frag instead of the per socket one for
- * optimization when we know that we're in the normal context and owns
+ * optimization when we know that we're in process context and own
* everything that's associated with %current.
*
- * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
- * inside other socket operations and end up recursing into sk_page_frag()
- * while it's already in use.
+ * Both direct reclaim and page faults can nest inside other
+ * socket operations and end up recursing into sk_page_frag()
+ * while it's already in use: explicitly avoid task page_frag
+ * usage if the caller is potentially doing any of them.
+ * This assumes that page fault handlers use the GFP_NOFS flags.
*
* Return: a per task page_frag if context allows that,
* otherwise a per socket one.
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
- if (gfpflags_normal_context(sk->sk_allocation))
+ if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
+ (__GFP_DIRECT_RECLAIM | __GFP_FS))
return &current->task_frag;
return &sk->sk_frag;
@@ -2290,6 +2590,11 @@ static inline gfp_t gfp_any(void)
return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
}
+static inline gfp_t gfp_memcg_charge(void)
+{
+ return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
+}
+
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
return noblock ? 0 : sk->sk_rcvtimeo;
@@ -2404,20 +2709,21 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
__sock_recv_wifi_status(msg, sk, skb);
}
-void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
- struct sk_buff *skb);
+void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb);
#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
-static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
- struct sk_buff *skb)
+static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
{
-#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
- (1UL << SOCK_RCVTSTAMP))
+#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVMARK))
#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
SOF_TIMESTAMPING_RAW_HARDWARE)
- if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
- __sock_recv_ts_and_drops(msg, sk, skb);
+ if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY)
+ __sock_recv_cmsgs(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
sock_write_timestamp(sk, skb->tstamp);
else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
@@ -2442,7 +2748,7 @@ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
__sock_tx_timestamp(tsflags, tx_flags);
if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey &&
tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
- *tskey = sk->sk_tskey++;
+ *tskey = atomic_inc_return(&sk->sk_tskey) - 1;
}
if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
*tx_flags |= SKBTX_WIFI_STATUS;
@@ -2460,7 +2766,11 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
&skb_shinfo(skb)->tskey);
}
-DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
+static inline bool sk_is_tcp(const struct sock *sk)
+{
+ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP;
+}
+
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
@@ -2472,47 +2782,56 @@ DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_unlink(skb, &sk->sk_receive_queue);
- if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
- !sk->sk_rx_skb_cache) {
- sk->sk_rx_skb_cache = skb;
- skb_orphan(skb);
- return;
- }
__kfree_skb(skb);
}
-static inline
-struct net *sock_net(const struct sock *sk)
+static inline bool
+skb_sk_is_prefetched(struct sk_buff *skb)
{
- return read_pnet(&sk->sk_net);
+#ifdef CONFIG_INET
+ return skb->destructor == sock_pfree;
+#else
+ return false;
+#endif /* CONFIG_INET */
}
-static inline
-void sock_net_set(struct sock *sk, struct net *net)
+/* This helper checks if a socket is a full socket,
+ * ie _not_ a timewait or request socket.
+ */
+static inline bool sk_fullsock(const struct sock *sk)
{
- write_pnet(&sk->sk_net, net);
+ return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
+}
+
+static inline bool
+sk_is_refcounted(struct sock *sk)
+{
+ /* Only full sockets have sk->sk_flags. */
+ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}
-static inline struct sock *skb_steal_sock(struct sk_buff *skb)
+/**
+ * skb_steal_sock - steal a socket from an sk_buff
+ * @skb: sk_buff to steal the socket from
+ * @refcounted: is set to true if the socket is reference-counted
+ */
+static inline struct sock *
+skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
if (skb->sk) {
struct sock *sk = skb->sk;
+ *refcounted = true;
+ if (skb_sk_is_prefetched(skb))
+ *refcounted = sk_is_refcounted(sk);
skb->destructor = NULL;
skb->sk = NULL;
return sk;
}
+ *refcounted = false;
return NULL;
}
-/* This helper checks if a socket is a full socket,
- * ie _not_ a timewait or request socket.
- */
-static inline bool sk_fullsock(const struct sock *sk)
-{
- return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
-}
-
/* Checks if this SKB belongs to an HW offloaded socket
* and whether any SW fallbacks are required based on dev.
* Check decrypted mark in case skb_orphan() cleared socket.
@@ -2575,24 +2894,25 @@ extern int sysctl_optmem_max;
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_wmem ? */
if (proto->sysctl_wmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset));
- return *proto->sysctl_wmem;
+ return READ_ONCE(*proto->sysctl_wmem);
}
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_rmem ? */
if (proto->sysctl_rmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset));
- return *proto->sysctl_rmem;
+ return READ_ONCE(*proto->sysctl_rmem);
}
/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
@@ -2613,13 +2933,14 @@ static inline void sk_pacing_shift_update(struct sock *sk, int val)
*/
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
int mdif;
- if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)
+ if (!bound_dev_if || bound_dev_if == dif)
return true;
mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif);
- if (mdif && mdif == sk->sk_bound_dev_if)
+ if (mdif && mdif == bound_dev_if)
return true;
return false;
@@ -2627,4 +2948,31 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
void sock_def_readable(struct sock *sk);
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
+void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
+int sock_set_timestamping(struct sock *sk, int optname,
+ struct so_timestamping timestamping);
+
+void sock_enable_timestamps(struct sock *sk);
+void sock_no_linger(struct sock *sk);
+void sock_set_keepalive(struct sock *sk);
+void sock_set_priority(struct sock *sk, u32 priority);
+void sock_set_rcvbuf(struct sock *sk, int val);
+void sock_set_mark(struct sock *sk, u32 val);
+void sock_set_reuseaddr(struct sock *sk);
+void sock_set_reuseport(struct sock *sk);
+void sock_set_sndtimeo(struct sock *sk, s64 secs);
+
+int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
+
+int sock_get_timeout(long timeo, void *optval, bool old_timeval);
+int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
+ sockptr_t optval, int optlen, bool old_timeval);
+
+static inline bool sk_is_readable(struct sock *sk)
+{
+ if (sk->sk_prot->sock_is_readable)
+ return sk->sk_prot->sock_is_readable(sk);
+ return false;
+}
#endif /* _SOCK_H */