Diffstat (limited to 'net/core/sock.c')
 net/core/sock.c | 1488
 1 file changed, 984 insertions(+), 504 deletions(-)
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f71684305c3..a3ba0358c77c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -113,6 +113,7 @@
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
@@ -138,10 +139,15 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
+#include <linux/ethtool.h>
+
+#include "dev.h"
+
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
-static void sock_inuse_add(struct net *net, int val);
+static void sock_def_write_space_wfree(struct sock *sk);
+static void sock_def_write_space(struct sock *sk);
/**
* sk_ns_capable - General socket capability test
@@ -223,6 +229,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MCTP" , \
x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
@@ -323,14 +330,33 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
noreclaim_flag = memalloc_noreclaim_save();
- ret = sk->sk_backlog_rcv(sk, skb);
+ ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+ tcp_v6_do_rcv,
+ tcp_v4_do_rcv,
+ sk, skb);
memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
-static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
+void sk_error_report(struct sock *sk)
+{
+ sk->sk_error_report(sk);
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ fallthrough;
+ case AF_INET6:
+ trace_inet_sk_error_report(sk);
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL(sk_error_report);
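Note: the new sk_error_report() wrapper gives protocols a single place that both invokes the socket's error callback and fires the inet tracepoint. A minimal sketch of a converted caller (the function name and error plumbing are illustrative, not part of this patch):

    static void example_deliver_err(struct sock *sk, int err)
    {
            sk->sk_err = err;       /* record the pending error */
            sk_error_report(sk);    /* wake pollers; traces AF_INET/AF_INET6 */
    }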
+
+int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
struct __kernel_sock_timeval tv;
@@ -359,36 +385,50 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
*(struct __kernel_sock_timeval *)optval = tv;
return sizeof(tv);
}
+EXPORT_SYMBOL(sock_get_timeout);
-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
+int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
+ sockptr_t optval, int optlen, bool old_timeval)
{
- struct __kernel_sock_timeval tv;
-
if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
struct old_timeval32 tv32;
if (optlen < sizeof(tv32))
return -EINVAL;
- if (copy_from_user(&tv32, optval, sizeof(tv32)))
+ if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
return -EFAULT;
- tv.tv_sec = tv32.tv_sec;
- tv.tv_usec = tv32.tv_usec;
+ tv->tv_sec = tv32.tv_sec;
+ tv->tv_usec = tv32.tv_usec;
} else if (old_timeval) {
struct __kernel_old_timeval old_tv;
if (optlen < sizeof(old_tv))
return -EINVAL;
- if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
+ if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
return -EFAULT;
- tv.tv_sec = old_tv.tv_sec;
- tv.tv_usec = old_tv.tv_usec;
+ tv->tv_sec = old_tv.tv_sec;
+ tv->tv_usec = old_tv.tv_usec;
} else {
- if (optlen < sizeof(tv))
+ if (optlen < sizeof(*tv))
return -EINVAL;
- if (copy_from_user(&tv, optval, sizeof(tv)))
+ if (copy_from_sockptr(tv, optval, sizeof(*tv)))
return -EFAULT;
}
+
+ return 0;
+}
+EXPORT_SYMBOL(sock_copy_user_timeval);
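Note: throughout this patch copy_from_user()/copy_to_user() become copy_from_sockptr()/copy_to_sockptr(). A sockptr_t wraps either a kernel or a user pointer, so the same option-parsing code serves both the syscall path and in-kernel callers. A sketch of the two construction paths (variable names illustrative):

    struct __kernel_sock_timeval tv = { .tv_sec = 1 };
    sockptr_t kp = KERNEL_SOCKPTR(&tv);     /* in-kernel caller */
    sockptr_t up = USER_SOCKPTR(optval);    /* syscall path, __user pointer */
    /* either one can feed sock_copy_user_timeval(&tv, ptr, optlen, false) */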
+
+static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
+ bool old_timeval)
+{
+ struct __kernel_sock_timeval tv;
+ int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
+
+ if (err)
+ return err;
+
if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
return -EDOM;
@@ -411,18 +451,6 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool
return 0;
}
-static void sock_warn_obsolete_bsdism(const char *name)
-{
- static int warned;
- static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
- warncomm, name);
- warned++;
- }
-}
-
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -480,17 +508,35 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
{
+ enum skb_drop_reason drop_reason;
int err;
err = sk_filter(sk, skb);
- if (err)
- return err;
-
- return __sock_queue_rcv_skb(sk, skb);
+ if (err) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+ goto out;
+ }
+ err = __sock_queue_rcv_skb(sk, skb);
+ switch (err) {
+ case -ENOMEM:
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ break;
+ case -ENOBUFS:
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ break;
+ default:
+ drop_reason = SKB_NOT_DROPPED_YET;
+ break;
+ }
+out:
+ if (reason)
+ *reason = drop_reason;
+ return err;
}
-EXPORT_SYMBOL(sock_queue_rcv_skb);
+EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
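Note: sock_queue_rcv_skb_reason() surfaces why a packet was rejected so callers can report it to the drop-monitor infrastructure. A hedged sketch of the intended caller pattern (caller shape assumed, not part of this hunk):

    enum skb_drop_reason reason;

    if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
            kfree_skb_reason(skb, reason);  /* reason shows up in drop tracepoints */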
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
const int nested, unsigned int trim_cap, bool refcounted)
@@ -536,11 +582,17 @@ discard_and_relse:
}
EXPORT_SYMBOL(__sk_receive_skb);
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
+ u32));
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = __sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && dst->obsolete &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
@@ -556,7 +608,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && dst->obsolete &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_dst_reset(sk);
dst_release(dst);
return NULL;
@@ -566,7 +620,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
}
EXPORT_SYMBOL(sk_dst_check);
-static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
+static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -574,14 +628,16 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
/* Sorry... */
ret = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
if (ifindex < 0)
goto out;
- sk->sk_bound_dev_if = ifindex;
+ /* Paired with all READ_ONCE() done locklessly. */
+ WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
+
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
@@ -594,8 +650,21 @@ out:
return ret;
}
-static int sock_setbindtodevice(struct sock *sk, char __user *optval,
- int optlen)
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
+{
+ int ret;
+
+ if (lock_sk)
+ lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, ifindex);
+ if (lock_sk)
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL(sock_bindtoindex);
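Note: sock_bindtoindex() exposes device binding to other kernel code; lock_sk says whether the socket lock still needs to be taken. Illustrative call (the ifindex value is assumed):

    err = sock_bindtoindex(sk, 2, true);    /* bind to ifindex 2, lock for us */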
+
+static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -617,7 +686,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
memset(devname, 0, sizeof(devname));
ret = -EFAULT;
- if (copy_from_user(devname, optval, optlen))
+ if (copy_from_sockptr(devname, optval, optlen))
goto out;
index = 0;
@@ -634,25 +703,25 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
goto out;
}
- lock_sock(sk);
- ret = sock_setbindtodevice_locked(sk, index);
- release_sock(sk);
-
+ sockopt_lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, index);
+ sockopt_release_sock(sk);
out:
#endif
return ret;
}
-static int sock_getbindtodevice(struct sock *sk, char __user *optval,
- int __user *optlen, int len)
+static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
struct net *net = sock_net(sk);
char devname[IFNAMSIZ];
- if (sk->sk_bound_dev_if == 0) {
+ if (bound_dev_if == 0) {
len = 0;
goto zero;
}
@@ -661,19 +730,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval,
if (len < IFNAMSIZ)
goto out;
- ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ ret = netdev_get_name(net, devname, bound_dev_if);
if (ret)
goto out;
len = strlen(devname) + 1;
ret = -EFAULT;
- if (copy_to_user(optval, devname, len))
+ if (copy_to_sockptr(optval, devname, len))
goto out;
zero:
ret = -EFAULT;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
goto out;
ret = 0;
@@ -684,15 +753,6 @@ out:
return ret;
}
-static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
- int valbool)
-{
- if (valbool)
- sock_set_flag(sk, bit);
- else
- sock_reset_flag(sk, bit);
-}
-
bool sk_mc_loop(struct sock *sk)
{
if (dev_recursion_level())
@@ -707,21 +767,322 @@ bool sk_mc_loop(struct sock *sk)
return inet6_sk(sk)->mc_loop;
#endif
}
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return true;
}
EXPORT_SYMBOL(sk_mc_loop);
+void sock_set_reuseaddr(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuse = SK_CAN_REUSE;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseaddr);
+
+void sock_set_reuseport(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuseport = true;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseport);
+
+void sock_no_linger(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_lingertime = 0;
+ sock_set_flag(sk, SOCK_LINGER);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_no_linger);
+
+void sock_set_priority(struct sock *sk, u32 priority)
+{
+ lock_sock(sk);
+ sk->sk_priority = priority;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_priority);
+
+void sock_set_sndtimeo(struct sock *sk, s64 secs)
+{
+ lock_sock(sk);
+ if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+ sk->sk_sndtimeo = secs * HZ;
+ else
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_sndtimeo);
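Note: these lock-and-set helpers give in-kernel socket users a typed API instead of reaching into setsockopt() internals. A sketch of a kernel user configuring a freshly created socket (creation boilerplate assumed):

    struct socket *sock;
    int err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);

    if (!err) {
            sock_set_reuseaddr(sock->sk);
            sock_set_priority(sock->sk, TC_PRIO_CONTROL);
            sock_set_sndtimeo(sock->sk, 5);         /* 5 second send timeout */
            sock_no_linger(sock->sk);
    }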
+
+static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
+{
+ if (val) {
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ }
+}
+
+void sock_enable_timestamps(struct sock *sk)
+{
+ lock_sock(sk);
+ __sock_set_timestamps(sk, true, false, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_enable_timestamps);
+
+void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
+{
+ switch (optname) {
+ case SO_TIMESTAMP_OLD:
+ __sock_set_timestamps(sk, valbool, false, false);
+ break;
+ case SO_TIMESTAMP_NEW:
+ __sock_set_timestamps(sk, valbool, true, false);
+ break;
+ case SO_TIMESTAMPNS_OLD:
+ __sock_set_timestamps(sk, valbool, false, true);
+ break;
+ case SO_TIMESTAMPNS_NEW:
+ __sock_set_timestamps(sk, valbool, true, true);
+ break;
+ }
+}
+
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
+ bool match = false;
+ int *vclock_index;
+ int i, num;
+
+ if (sk->sk_bound_dev_if)
+ dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+
+ if (!dev) {
+ pr_err("%s: sock not bound to device\n", __func__);
+ return -EOPNOTSUPP;
+ }
+
+ num = ethtool_get_phc_vclocks(dev, &vclock_index);
+ dev_put(dev);
+
+ for (i = 0; i < num; i++) {
+ if (*(vclock_index + i) == phc_index) {
+ match = true;
+ break;
+ }
+ }
+
+ if (num > 0)
+ kfree(vclock_index);
+
+ if (!match)
+ return -EINVAL;
+
+ sk->sk_bind_phc = phc_index;
+
+ return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+ struct so_timestamping timestamping)
+{
+ int val = timestamping.flags;
+ int ret;
+
+ if (val & ~SOF_TIMESTAMPING_MASK)
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk_is_tcp(sk)) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN))
+ return -EINVAL;
+ atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
+ } else {
+ atomic_set(&sk->sk_tskey, 0);
+ }
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY))
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_BIND_PHC) {
+ ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+ if (ret)
+ return ret;
+ }
+
+ sk->sk_tsflags = val;
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ return 0;
+}
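Note: from userspace the extended argument is the struct so_timestamping parsed above; passing only the legacy int still works thanks to the optlen check. The socket must already be bound to a device, as sock_timestamping_bind_phc() enforces. Sketch of the new form (the vclock index is an assumed value):

    struct so_timestamping ts = {
            .flags = SOF_TIMESTAMPING_TX_HARDWARE |
                     SOF_TIMESTAMPING_RAW_HARDWARE |
                     SOF_TIMESTAMPING_BIND_PHC,
            .bind_phc = 1,                  /* PHC vclock index, illustrative */
    };

    setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));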
+
+void sock_set_keepalive(struct sock *sk)
+{
+ lock_sock(sk);
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, true);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_keepalive);
+
+static void __sock_set_rcvbuf(struct sock *sk, int val)
+{
+ /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
+ * as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+
+ /* We double it on the way in to account for "struct sk_buff" etc.
+ * overhead. Applications assume that the SO_RCVBUF setting they make
+ * will allow that much actual data to be received on that socket.
+ *
+ * Applications are unaware that "struct sk_buff" and other overheads
+ * allocate from the receive buffer during socket buffer allocation.
+ *
+ * And after considering the possible alternatives, returning the value
+ * we actually used in getsockopt is the most desirable behavior.
+ */
+ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
+}
+
+void sock_set_rcvbuf(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __sock_set_rcvbuf(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_rcvbuf);
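Note: because of the doubling, the value read back via getsockopt(SO_RCVBUF) is twice what was requested, floored at SOCK_MIN_RCVBUF. For example:

    sock_set_rcvbuf(sk, 65536);     /* request 64 KiB */
    /* sk->sk_rcvbuf is now 131072: data plus sk_buff overhead budget */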
+
+static void __sock_set_mark(struct sock *sk, u32 val)
+{
+ if (val != sk->sk_mark) {
+ sk->sk_mark = val;
+ sk_dst_reset(sk);
+ }
+}
+
+void sock_set_mark(struct sock *sk, u32 val)
+{
+ lock_sock(sk);
+ __sock_set_mark(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_mark);
+
+static void sock_release_reserved_memory(struct sock *sk, int bytes)
+{
+ /* Round down bytes to multiple of pages */
+ bytes = round_down(bytes, PAGE_SIZE);
+
+ WARN_ON(bytes > sk->sk_reserved_mem);
+ sk->sk_reserved_mem -= bytes;
+ sk_mem_reclaim(sk);
+}
+
+static int sock_reserve_memory(struct sock *sk, int bytes)
+{
+ long allocated;
+ bool charged;
+ int pages;
+
+ if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
+ return -EOPNOTSUPP;
+
+ if (!bytes)
+ return 0;
+
+ pages = sk_mem_pages(bytes);
+
+ /* pre-charge to memcg */
+ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!charged)
+ return -ENOMEM;
+
+ /* pre-charge to forward_alloc */
+ sk_memory_allocated_add(sk, pages);
+ allocated = sk_memory_allocated(sk);
+ /* If the system goes into memory pressure with this
+ * precharge, give up and return error.
+ */
+ if (allocated > sk_prot_mem_limits(sk, 1)) {
+ sk_memory_allocated_sub(sk, pages);
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
+ return -ENOMEM;
+ }
+ sk->sk_forward_alloc += pages << PAGE_SHIFT;
+
+ sk->sk_reserved_mem += pages << PAGE_SHIFT;
+
+ return 0;
+}
+
+void sockopt_lock_sock(struct sock *sk)
+{
+ /* When current->bpf_ctx is set, the setsockopt is called from
+ * a bpf prog. bpf has ensured the sk lock has been
+ * acquired before calling setsockopt().
+ */
+ if (has_current_bpf_ctx())
+ return;
+
+ lock_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_lock_sock);
+
+void sockopt_release_sock(struct sock *sk)
+{
+ if (has_current_bpf_ctx())
+ return;
+
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_release_sock);
+
+bool sockopt_ns_capable(struct user_namespace *ns, int cap)
+{
+ return has_current_bpf_ctx() || ns_capable(ns, cap);
+}
+EXPORT_SYMBOL(sockopt_ns_capable);
+
+bool sockopt_capable(int cap)
+{
+ return has_current_bpf_ctx() || capable(cap);
+}
+EXPORT_SYMBOL(sockopt_capable);
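Note: together these wrappers let sk_setsockopt() below run unchanged from the setsockopt() syscall and from BPF, where the verified program already holds the socket lock. A condensed sketch of the pattern (hypothetical helper, in the same file so the static __sock_set_mark() is visible):

    static int example_set_mark(struct sock *sk, u32 val)
    {
            int ret = 0;

            sockopt_lock_sock(sk);          /* no-op under a BPF program */
            if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                    ret = -EPERM;           /* BPF callers pass implicitly */
            else
                    __sock_set_mark(sk, val);
            sockopt_release_sock(sk);
            return ret;
    }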
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
-int sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
+ struct so_timestamping timestamping;
+ struct socket *sock = sk->sk_socket;
struct sock_txtime sk_txtime;
- struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
@@ -737,16 +1098,16 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
valbool = val ? 1 : 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case SO_DEBUG:
- if (val && !capable(CAP_NET_ADMIN))
+ if (val && !sockopt_capable(CAP_NET_ADMIN))
ret = -EACCES;
else
sock_valbool_flag(sk, SOCK_DBG, valbool);
@@ -776,7 +1137,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_wmem_max);
+ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
/* Ensure val * 2 fits into an int, to prevent max_t()
* from treating it as a negative value.
@@ -790,7 +1151,7 @@ set_sndbuf:
break;
case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -808,34 +1169,11 @@ set_sndbuf:
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_rmem_max);
-set_rcvbuf:
- /* Ensure val * 2 fits into an int, to prevent max_t()
- * from treating it as a negative value.
- */
- val = min_t(int, val, INT_MAX / 2);
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- /*
- * We double it on the way in to account for
- * "struct sk_buff" etc. overhead. Applications
- * assume that the SO_RCVBUF setting they make will
- * allow that much actual data to be received on that
- * socket.
- *
- * Applications are unaware that "struct sk_buff" and
- * other overheads allocate from the receive buffer
- * during socket buffer allocation.
- *
- * And after considering the possible alternatives,
- * returning the value we actually used in getsockopt
- * is the most desirable behavior.
- */
- WRITE_ONCE(sk->sk_rcvbuf,
- max_t(int, val * 2, SOCK_MIN_RCVBUF));
+ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
break;
case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -843,9 +1181,8 @@ set_rcvbuf:
/* No negative values (to prevent underflow, as val will be
* multiplied by 2).
*/
- if (val < 0)
- val = 0;
- goto set_rcvbuf;
+ __sock_set_rcvbuf(sk, max(val, 0));
+ break;
case SO_KEEPALIVE:
if (sk->sk_prot->keepalive)
@@ -863,7 +1200,8 @@ set_rcvbuf:
case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
ret = -EPERM;
@@ -874,7 +1212,7 @@ set_rcvbuf:
ret = -EINVAL; /* 1003.1g */
break;
}
- if (copy_from_user(&ling, optval, sizeof(ling))) {
+ if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
ret = -EFAULT;
break;
}
@@ -892,7 +1230,6 @@ set_rcvbuf:
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
break;
case SO_PASSCRED:
@@ -906,72 +1243,28 @@ set_rcvbuf:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
case SO_TIMESTAMPNS_NEW:
- if (valbool) {
- if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
- sock_set_flag(sk, SOCK_TSTAMP_NEW);
- else
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
- if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- else
- sock_set_flag(sk, SOCK_RCVTSTAMPNS);
- sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- } else {
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
- }
+ sock_set_timestamp(sk, optname, valbool);
break;
case SO_TIMESTAMPING_NEW:
- sock_set_flag(sk, SOCK_TSTAMP_NEW);
- /* fall through */
case SO_TIMESTAMPING_OLD:
- if (val & ~SOF_TIMESTAMPING_MASK) {
- ret = -EINVAL;
- break;
- }
-
- if (val & SOF_TIMESTAMPING_OPT_ID &&
- !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
- if ((1 << sk->sk_state) &
- (TCPF_CLOSE | TCPF_LISTEN)) {
- ret = -EINVAL;
- break;
- }
- sk->sk_tskey = tcp_sk(sk)->snd_una;
- } else {
- sk->sk_tskey = 0;
+ if (optlen == sizeof(timestamping)) {
+ if (copy_from_sockptr(&timestamping, optval,
+ sizeof(timestamping))) {
+ ret = -EFAULT;
+ break;
}
+ } else {
+ memset(&timestamping, 0, sizeof(timestamping));
+ timestamping.flags = val;
}
-
- if (val & SOF_TIMESTAMPING_OPT_STATS &&
- !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
- ret = -EINVAL;
- break;
- }
-
- sk->sk_tsflags = val;
- if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
- sock_enable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
- else {
- if (optname == SO_TIMESTAMPING_NEW)
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
- sock_disable_timestamp(sk,
- (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
- }
+ ret = sock_set_timestamping(sk, optname, timestamping);
break;
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- if (sock->ops->set_rcvlowat)
+ if (sock && sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
@@ -979,60 +1272,52 @@ set_rcvbuf:
case SO_RCVTIMEO_OLD:
case SO_RCVTIMEO_NEW:
- ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
+ optlen, optname == SO_RCVTIMEO_OLD);
break;
case SO_SNDTIMEO_OLD:
case SO_SNDTIMEO_NEW:
- ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
+ ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
+ optlen, optname == SO_SNDTIMEO_OLD);
break;
- case SO_ATTACH_FILTER:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
-
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
+ case SO_ATTACH_FILTER: {
+ struct sock_fprog fprog;
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_BPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_attach_bpf(ufd, sk);
}
break;
- case SO_ATTACH_REUSEPORT_CBPF:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
-
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
+ case SO_ATTACH_REUSEPORT_CBPF: {
+ struct sock_fprog fprog;
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_reuseport_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_REUSEPORT_EBPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_reuseport_attach_bpf(ufd, sk);
@@ -1061,12 +1346,22 @@ set_rcvbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+
+ __sock_set_mark(sk, val);
+ break;
+ case SO_RCVMARK:
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
- } else if (val != sk->sk_mark) {
- sk->sk_mark = val;
- sk_dst_reset(sk);
+ break;
}
+
+ sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
break;
case SO_RXQ_OVFL:
@@ -1095,24 +1390,40 @@ set_rcvbuf:
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
/* allow unprivileged users to decrease the value */
- if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else {
if (val < 0)
ret = -EINVAL;
else
- sk->sk_ll_usec = val;
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ }
+ break;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ break;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ } else {
+ if (val < 0 || val > U16_MAX)
+ ret = -EINVAL;
+ else
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
}
break;
#endif
case SO_MAX_PACING_RATE:
{
- unsigned long ulval = (val == ~0U) ? ~0UL : val;
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
- get_user(ulval, (unsigned long __user *)optval)) {
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
ret = -EFAULT;
break;
}
@@ -1135,13 +1446,12 @@ set_rcvbuf:
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (!((sk->sk_type == SOCK_STREAM &&
- sk->sk_protocol == IPPROTO_TCP) ||
+ if (!(sk_is_tcp(sk) ||
(sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP)))
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
} else if (sk->sk_family != PF_RDS) {
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
}
if (!ret) {
if (val < 0 || val > 1)
@@ -1152,38 +1462,98 @@ set_rcvbuf:
break;
case SO_TXTIME:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else if (optlen != sizeof(struct sock_txtime)) {
+ if (optlen != sizeof(struct sock_txtime)) {
ret = -EINVAL;
- } else if (copy_from_user(&sk_txtime, optval,
+ break;
+ } else if (copy_from_sockptr(&sk_txtime, optval,
sizeof(struct sock_txtime))) {
ret = -EFAULT;
+ break;
} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
ret = -EINVAL;
- } else {
- sock_valbool_flag(sk, SOCK_TXTIME, true);
- sk->sk_clockid = sk_txtime.clockid;
- sk->sk_txtime_deadline_mode =
- !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
- sk->sk_txtime_report_errors =
- !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
+ }
+ /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
+ * scheduler has enough safe guards.
+ */
+ if (sk_txtime.clockid != CLOCK_MONOTONIC &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
}
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
break;
case SO_BINDTOIFINDEX:
- ret = sock_setbindtodevice_locked(sk, val);
+ ret = sock_bindtoindex_locked(sk, val);
+ break;
+
+ case SO_BUF_LOCK:
+ if (val & ~SOCK_BUF_LOCK_MASK) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_userlocks = val | (sk->sk_userlocks &
+ ~SOCK_BUF_LOCK_MASK);
+ break;
+
+ case SO_RESERVE_MEM:
+ {
+ int delta;
+
+ if (val < 0) {
+ ret = -EINVAL;
+ break;
+ }
+
+ delta = val - sk->sk_reserved_mem;
+ if (delta < 0)
+ sock_release_reserved_memory(sk, -delta);
+ else
+ ret = sock_reserve_memory(sk, delta);
+ break;
+ }
+
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Paired with READ_ONCE() in tcp_rtx_synack() */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;
default:
ret = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return sk_setsockopt(sock->sk, level, optname,
+ optval, optlen);
+}
EXPORT_SYMBOL(sock_setsockopt);
+static const struct cred *sk_get_peer_cred(struct sock *sk)
+{
+ const struct cred *cred;
+
+ spin_lock(&sk->sk_peer_lock);
+ cred = get_cred(sk->sk_peer_cred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ return cred;
+}
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
struct ucred *ucred)
@@ -1198,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
-static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
struct user_namespace *user_ns = current_user_ns();
int i;
- for (i = 0; i < src->ngroups; i++)
- if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ for (i = 0; i < src->ngroups; i++) {
+ gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
+
+ if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
return -EFAULT;
+ }
return 0;
}
-int sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
- struct sock *sk = sock->sk;
+ struct socket *sock = sk->sk_socket;
union {
int val;
@@ -1224,12 +1597,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
struct __kernel_old_timeval tm;
struct __kernel_sock_timeval stm;
struct sock_txtime txtime;
+ struct so_timestamping timestamping;
} v;
int lv = sizeof(int);
int len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
@@ -1306,7 +1680,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
break;
case SO_TIMESTAMP_OLD:
@@ -1328,7 +1701,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_TIMESTAMPING_OLD:
- v.val = sk->sk_tsflags;
+ lv = sizeof(v.timestamping);
+ v.timestamping.flags = sk->sk_tsflags;
+ v.timestamping.bind_phc = sk->sk_bind_phc;
break;
case SO_RCVTIMEO_OLD:
@@ -1358,28 +1733,35 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
struct ucred peercred;
if (len > sizeof(peercred))
len = sizeof(peercred);
+
+ spin_lock(&sk->sk_peer_lock);
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
- if (copy_to_user(optval, &peercred, len))
+ spin_unlock(&sk->sk_peer_lock);
+
+ if (copy_to_sockptr(optval, &peercred, len))
return -EFAULT;
goto lenout;
}
case SO_PEERGROUPS:
{
+ const struct cred *cred;
int ret, n;
- if (!sk->sk_peer_cred)
+ cred = sk_get_peer_cred(sk);
+ if (!cred)
return -ENODATA;
- n = sk->sk_peer_cred->group_info->ngroups;
+ n = cred->group_info->ngroups;
if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
- return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ put_cred(cred);
+ return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
- ret = groups_to_user((gid_t __user *)optval,
- sk->sk_peer_cred->group_info);
+ ret = groups_to_user(optval, cred->group_info);
+ put_cred(cred);
if (ret)
return ret;
goto lenout;
@@ -1394,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return -ENOTCONN;
if (lv < len)
return -EINVAL;
- if (copy_to_user(optval, address, len))
+ if (copy_to_sockptr(optval, address, len))
return -EFAULT;
goto lenout;
}
@@ -1411,12 +1793,16 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PEERSEC:
- return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len);
case SO_MARK:
v.val = sk->sk_mark;
break;
+ case SO_RCVMARK:
+ v.val = sock_flag(sk, SOCK_RCVMARK);
+ break;
+
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL);
break;
@@ -1439,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return sock_getbindtodevice(sk, optval, optlen, len);
case SO_GET_FILTER:
- len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ len = sk_get_filter(sk, optval, len);
if (len < 0)
return len;
@@ -1461,6 +1847,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
+ case SO_PREFER_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_prefer_busy_poll);
+ break;
#endif
case SO_MAX_PACING_RATE:
@@ -1484,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
sk_get_meminfo(sk, meminfo);
len = min_t(unsigned int, len, sizeof(meminfo));
- if (copy_to_user(optval, &meminfo, len))
+ if (copy_to_sockptr(optval, &meminfo, len))
return -EFAULT;
goto lenout;
@@ -1522,7 +1911,26 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BINDTOIFINDEX:
- v.val = sk->sk_bound_dev_if;
+ v.val = READ_ONCE(sk->sk_bound_dev_if);
+ break;
+
+ case SO_NETNS_COOKIE:
+ lv = sizeof(u64);
+ if (len != lv)
+ return -EINVAL;
+ v.val64 = sock_net(sk)->net_cookie;
+ break;
+
+ case SO_BUF_LOCK:
+ v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
+ break;
+
+ case SO_RESERVE_MEM:
+ v.val = sk->sk_reserved_mem;
+ break;
+
+ case SO_TXREHASH:
+ v.val = sk->sk_txrehash;
break;
default:
@@ -1534,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len > lv)
len = lv;
- if (copy_to_user(optval, &v, len))
+ if (copy_to_sockptr(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return sk_getsockopt(sock->sk, level, optname,
+ USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
+}
+
/*
* Initialize an sk_lock.
*
@@ -1572,13 +1988,24 @@ static inline void sock_lock_init(struct sock *sk)
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
+ const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
+
+ /* If we move sk_tx_queue_mapping out of the private section,
+ * we must check if sk_tx_queue_clear() is called after
+ * sock_copy() in sk_clone_lock().
+ */
+ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+ offsetof(struct sock, sk_dontcopy_begin) ||
+ offsetof(struct sock, sk_tx_queue_mapping) >=
+ offsetof(struct sock, sk_dontcopy_end));
+
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
@@ -1608,7 +2035,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
- sk_tx_queue_clear(sk);
}
return sk;
@@ -1666,7 +2092,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt)) {
- get_net(net);
+ get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
}
@@ -1677,6 +2103,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
+ sk_tx_queue_clear(sk);
}
return sk;
@@ -1716,11 +2143,12 @@ static void __sk_destruct(struct rcu_head *head)
sk->sk_frag.page = NULL;
}
- if (sk->sk_peer_cred)
- put_cred(sk->sk_peer_cred);
+ /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
+ put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
+
if (likely(sk->sk_net_refcnt))
- put_net(sock_net(sk));
+ put_net_track(sock_net(sk), &sk->ns_tracker);
sk_prot_free(sk->sk_prot_creator, sk);
}
@@ -1792,116 +2220,122 @@ static void sk_init_common(struct sock *sk)
*/
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
- struct sock *newsk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct sk_filter *filter;
bool is_charged = true;
+ struct sock *newsk;
- newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
- if (newsk != NULL) {
- struct sk_filter *filter;
-
- sock_copy(newsk, sk);
-
- newsk->sk_prot_creator = sk->sk_prot;
+ newsk = sk_prot_alloc(prot, priority, sk->sk_family);
+ if (!newsk)
+ goto out;
- /* SANITY */
- if (likely(newsk->sk_net_refcnt))
- get_net(sock_net(newsk));
- sk_node_init(&newsk->sk_node);
- sock_lock_init(newsk);
- bh_lock_sock(newsk);
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_backlog.len = 0;
+ sock_copy(newsk, sk);
- atomic_set(&newsk->sk_rmem_alloc, 0);
- /*
- * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
- */
- refcount_set(&newsk->sk_wmem_alloc, 1);
- atomic_set(&newsk->sk_omem_alloc, 0);
- sk_init_common(newsk);
+ newsk->sk_prot_creator = prot;
- newsk->sk_dst_cache = NULL;
- newsk->sk_dst_pending_confirm = 0;
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
- atomic_set(&newsk->sk_drops, 0);
- newsk->sk_send_head = NULL;
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- atomic_set(&newsk->sk_zckey, 0);
+ /* SANITY */
+ if (likely(newsk->sk_net_refcnt)) {
+ get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
+ sock_inuse_add(sock_net(newsk), 1);
+ }
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
- sock_reset_flag(newsk, SOCK_DONE);
+ atomic_set(&newsk->sk_rmem_alloc, 0);
- /* sk->sk_memcg will be populated at accept() time */
- newsk->sk_memcg = NULL;
+ /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
+ refcount_set(&newsk->sk_wmem_alloc, 1);
- cgroup_sk_alloc(&newsk->sk_cgrp_data);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ sk_init_common(newsk);
- rcu_read_lock();
- filter = rcu_dereference(sk->sk_filter);
- if (filter != NULL)
- /* though it's an empty new sock, the charging may fail
- * if sysctl_optmem_max was changed between creation of
- * original socket and cloning
- */
- is_charged = sk_filter_charge(newsk, filter);
- RCU_INIT_POINTER(newsk->sk_filter, filter);
- rcu_read_unlock();
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_reserved_mem = 0;
+ atomic_set(&newsk->sk_drops, 0);
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* We need to make sure that we don't uncharge the new
- * socket if we couldn't charge it in the first place
- * as otherwise we uncharge the parent's filter.
- */
- if (!is_charged)
- RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
- RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+ sock_reset_flag(newsk, SOCK_DONE);
- if (bpf_sk_storage_clone(sk, newsk)) {
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
- newsk->sk_err = 0;
- newsk->sk_err_soft = 0;
- newsk->sk_priority = 0;
- newsk->sk_incoming_cpu = raw_smp_processor_id();
- if (likely(newsk->sk_net_refcnt))
- sock_inuse_add(sock_net(newsk), 1);
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
- /*
- * Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
*/
- smp_wmb();
- refcount_set(&newsk->sk_refcnt, 2);
+ is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
- /*
- * Increment the counter in the same struct proto as the master
- * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
- * is the same as sk->sk_prot->socks, as this field was copied
- * with memcpy).
- *
- * This _changes_ the previous behaviour, where
- * tcp_create_openreq_child always was incrementing the
- * equivalent to tcp_prot->socks (inet_sock_nr), so this have
- * to be taken into account in all callers. -acme
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
*/
- sk_refcnt_debug_inc(newsk);
- sk_set_socket(newsk, NULL);
- RCU_INIT_POINTER(newsk->sk_wq, NULL);
-
- if (newsk->sk_prot->sockets_allocated)
- sk_sockets_allocated_inc(newsk);
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
- if (sock_needs_netstamp(sk) &&
- newsk->sk_flags & SK_FLAGS_TIMESTAMP)
- net_enable_timestamp();
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
}
+
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ newsk->sk_user_data = NULL;
+
+ newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+
+ /* Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.rst for details)
+ */
+ smp_wmb();
+ refcount_set(&newsk->sk_refcnt, 2);
+
+ /* Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
+ RCU_INIT_POINTER(newsk->sk_wq, NULL);
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
out:
return newsk;
}
@@ -1917,22 +2351,42 @@ void sk_free_unlock_clone(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+static void sk_trim_gso_size(struct sock *sk)
+{
+ if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
+ return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ sk_is_tcp(sk) &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return;
+#endif
+ sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
+}
+
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
sk_dst_set(sk, dst);
- sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
+ sk->sk_route_caps = dst->dev->features;
+ if (sk_is_tcp(sk))
+ sk->sk_route_caps |= NETIF_F_GSO;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
- sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (unlikely(sk->sk_gso_disabled))
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
if (sk_can_gso(sk)) {
if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = dst->dev->gso_max_size;
- max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
+ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+ sk_trim_gso_size(sk);
+ sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
+ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
@@ -1951,8 +2405,20 @@ void sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
unsigned int len = skb->truesize;
+ bool free;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ if (sock_flag(sk, SOCK_RCU_FREE) &&
+ sk->sk_write_space == sock_def_write_space) {
+ rcu_read_lock();
+ free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
+ sock_def_write_space_wfree(sk);
+ rcu_read_unlock();
+ if (unlikely(free))
+ __sk_free(sk);
+ return;
+ }
+
/*
* Keep a reference on sk_wmem_alloc, this will be released
* after sk_write_space() call
@@ -2027,16 +2493,10 @@ void skb_orphan_partial(struct sk_buff *skb)
if (skb_is_tcp_pure_ack(skb))
return;
- if (can_skb_orphan_partial(skb)) {
- struct sock *sk = skb->sk;
+ if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
+ return;
- if (refcount_inc_not_zero(&sk->sk_refcnt)) {
- WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
- skb->destructor = sock_efree;
- }
- } else {
- skb_orphan(skb);
- }
+ skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
@@ -2063,6 +2523,18 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ if (sk_is_refcounted(skb->sk))
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
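Note: a sketch of how a prefetch/steering path would arm this destructor (caller shape assumed; it mirrors the refcount-aware hold/put pairing):

    skb_orphan(skb);
    skb->sk = sk;
    skb->destructor = sock_pfree;
    if (sk_is_refcounted(sk))
            sock_hold(sk);          /* sock_pfree() drops it via sock_gen_put() */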
+
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
@@ -2118,7 +2590,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
- sysctl_optmem_max)
+ READ_ONCE(sysctl_optmem_max))
return NULL;
skb = alloc_skb(size, priority);
@@ -2136,8 +2608,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned int)size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
+
+ if ((unsigned int)size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
* might sleep.
@@ -2162,7 +2636,7 @@ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
if (WARN_ON_ONCE(!mem))
return;
if (nullify)
- kzfree(mem);
+ kfree_sensitive(mem);
else
kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
@@ -2256,13 +2730,6 @@ failure:
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
- int noblock, int *errcode)
-{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
-}
-EXPORT_SYMBOL(sock_alloc_send_skb);
-
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
struct sockcm_cookie *sockc)
{
@@ -2270,7 +2737,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
switch (cmsg->cmsg_type) {
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
@@ -2344,8 +2812,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
}
}
-/* On 32bit arches, an skb frag is limited to 2^15 */
-#define SKB_FRAG_PAGE_ORDER get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
@@ -2403,7 +2869,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
}
EXPORT_SYMBOL(sk_page_frag_refill);
-static void __lock_sock(struct sock *sk)
+void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
@@ -2435,7 +2901,7 @@ void __release_sock(struct sock *sk)
do {
next = skb->next;
prefetch(next);
- WARN_ON_ONCE(skb_dst_is_noref(skb));
+ DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
@@ -2460,6 +2926,7 @@ void __sk_flush_backlog(struct sock *sk)
__release_sock(sk);
spin_unlock_bh(&sk->sk_lock.slock);
}
+EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
@@ -2497,12 +2964,16 @@ EXPORT_SYMBOL(sk_wait_data);
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
+ bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
struct proto *prot = sk->sk_prot;
- long allocated = sk_memory_allocated_add(sk, amt);
bool charged = true;
+ long allocated;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ if (memcg_charge &&
+ !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge())))
goto suppress_allocation;
/* Under limit. */
@@ -2556,8 +3027,14 @@ suppress_allocation:
/* Fail only if socket is _under_ its sndbuf.
* In this case we cannot block, so that we have to fail.
*/
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
+ /* Force charge with __GFP_NOFAIL */
+ if (memcg_charge && !charged) {
+ mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
+ }
return 1;
+ }
}
if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
@@ -2565,12 +3042,11 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ if (memcg_charge && charged)
mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
return 0;
}
-EXPORT_SYMBOL(__sk_mem_raise_allocated);
/**
* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
@@ -2586,10 +3062,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
int ret, amt = sk_mem_pages(size);
- sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc += amt << PAGE_SHIFT;
ret = __sk_mem_raise_allocated(sk, size, amt, kind);
if (!ret)
- sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amt << PAGE_SHIFT;
return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2612,17 +3088,16 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
-EXPORT_SYMBOL(__sk_mem_reduce_allocated);
/**
* __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
*/
void __sk_mem_reclaim(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ amount >>= PAGE_SHIFT;
+ sk->sk_forward_alloc -= amount << PAGE_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -2692,20 +3167,6 @@ int sock_no_shutdown(struct socket *sock, int how)
}
EXPORT_SYMBOL(sock_no_shutdown);
-int sock_no_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_setsockopt);
-
-int sock_no_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_getsockopt);
-
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
return -EOPNOTSUPP;
@@ -2732,6 +3193,21 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
}
EXPORT_SYMBOL(sock_no_mmap);
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
+{
+ struct socket *sock;
+
+ sock = sock_from_file(file);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
+}
+
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
ssize_t res;
@@ -2811,20 +3287,42 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
+ if (sock_writeable(sk)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
}
+/* An optimised version of sock_def_write_space(), should only be called
+ * for SOCK_RCU_FREE sockets under RCU read section and after putting
+ * ->sk_wmem_alloc.
+ */
+static void sock_def_write_space_wfree(struct sock *sk)
+{
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if (sock_writeable(sk)) {
+ struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+ /* rely on refcount_sub from sock_wfree() */
+ smp_mb__after_atomic();
+ if (wq && waitqueue_active(&wq->wait))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+
+ /* Should agree with poll, otherwise some programs break */
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+}
+
static void sock_def_destruct(struct sock *sk)
{
}
@@ -2852,6 +3350,13 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer)
}
EXPORT_SYMBOL(sk_stop_timer);
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
+{
+ if (del_timer_sync(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer_sync);
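Note: unlike sk_stop_timer(), the _sync variant waits for a concurrently running handler to finish, so it suits teardown paths; it must not be called from the timer callback itself. Usage sketch:

    sk_stop_timer_sync(sk, &sk->sk_timer);  /* handler has finished on return */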
+
void sock_init_data(struct socket *sock, struct sock *sk)
{
sk_init_common(sk);
@@ -2860,8 +3365,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
timer_setup(&sk->sk_timer, NULL, 0);
sk->sk_allocation = GFP_KERNEL;
- sk->sk_rcvbuf = sysctl_rmem_default;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
@@ -2901,6 +3406,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
+ spin_lock_init(&sk->sk_peer_lock);
+
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
@@ -2914,18 +3421,19 @@ void sock_init_data(struct socket *sock, struct sock *sk)
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
- sk->sk_ll_usec = sysctl_net_busy_read;
+ sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif
sk->sk_max_pacing_rate = ~0UL;
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
+ sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
+ * (Documentation/RCU/rculist_nulls.rst for details)
*/
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
@@ -2935,17 +3443,15 @@ EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
+ /* The sk_lock has mutex_lock() semantics here. */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (sk->sk_lock.owned)
+ if (sock_owned_by_user_nocheck(sk))
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
- /*
- * The sk_lock has mutex_lock() semantics here:
- */
- mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
- local_bh_enable();
+ spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);
@@ -2968,41 +3474,37 @@ void release_sock(struct sock *sk)
}
EXPORT_SYMBOL(release_sock);
-/**
- * lock_sock_fast - fast version of lock_sock
- * @sk: socket
- *
- * This version should be used for very small section, where process wont block
- * return false if fast path is taken:
- *
- * sk_lock.slock locked, owned = 0, BH disabled
- *
- * return true if slow path is taken:
- *
- * sk_lock.slock unlocked, owned = 1, BH enabled
- */
-bool lock_sock_fast(struct sock *sk)
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (!sk->sk_lock.owned)
+ if (!sock_owned_by_user_nocheck(sk)) {
/*
- * Note : We must disable BH
+ * Fast path return with bottom halves disabled and
+ * sock::sk_lock.slock held.
+ *
+ * The 'mutex' is not contended and holding
+ * sock::sk_lock.slock prevents all other lockers from
+ * proceeding, so the corresponding unlock_sock_fast() can
+ * avoid the slow path of release_sock() completely and
+ * just release slock.
+ *
+ * From a semantic POV this is equivalent to 'acquiring'
+ * the 'mutex', hence the corresponding lockdep
+ * mutex_release() has to happen in the fast path of
+ * unlock_sock_fast().
*/
return false;
+ }
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
- /*
- * The sk_lock has mutex_lock() semantics here:
- */
- mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
- local_bh_enable();
+ __acquire(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
return true;
}
-EXPORT_SYMBOL(lock_sock_fast);
+EXPORT_SYMBOL(__lock_sock_fast);
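Callers keep using the lock_sock_fast()/unlock_sock_fast() pair; the inline lock_sock_fast() wrapper is expected to issue the lockdep mutex_acquire() itself so the annotation balances with the mutex_release() described in the fast-path comment. A usage sketch, assuming the mainline wrapper API in include/net/sock.h:

/* Typical short, non-blocking critical section. */
bool slow = lock_sock_fast(sk);

WRITE_ONCE(sk->sk_rcvlowat, 1);	/* some small state update */
unlock_sock_fast(sk, slow);	/* drops slock, or full release_sock() */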
int sock_gettstamp(struct socket *sock, void __user *userstamp,
bool timeval, bool time32)
@@ -3108,24 +3610,11 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
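The READ_ONCE() is only half of the contract: the IPV6_ADDRFORM code that retargets the socket must publish the new ops with WRITE_ONCE() so concurrent readers see either the old or the new pointer, never a torn value. A paraphrased sketch of the pairing (the writer side lives in net/ipv6/ipv6_sockglue.c):

/* Writer (paraphrased IPV6_ADDRFORM handling): */
WRITE_ONCE(sk->sk_prot, &tcp_prot);

/* Reader (as in the helpers here): one tearing-free load, then dispatch. */
struct proto *prot = READ_ONCE(sk->sk_prot);
return prot->getsockopt(sk, level, optname, optval, optlen);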
-#ifdef CONFIG_COMPAT
-int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_getsockopt != NULL)
- return sk->sk_prot->compat_getsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_getsockopt);
-#endif
-
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
@@ -3133,8 +3622,7 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int addr_len = 0;
int err;
- err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
@@ -3145,35 +3633,22 @@ EXPORT_SYMBOL(sock_common_recvmsg);
* Set socket options on an inet socket.
*/
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_setsockopt != NULL)
- return sk->sk_prot->compat_setsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_setsockopt);
-#endif
-
void sk_common_release(struct sock *sk)
{
if (sk->sk_prot->destroy)
sk->sk_prot->destroy(sk);
/*
- * Observation: when sock_common_release is called, processes have
+ * Observation: when sk_common_release is called, processes have
* no access to socket. But net still has.
* Step one, detach it from networking:
*
@@ -3220,19 +3695,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
}
#ifdef CONFIG_PROC_FS
-#define PROTO_INUSE_NR 64 /* should be enough for the first time */
-struct prot_inuse {
- int val[PROTO_INUSE_NR];
-};
-
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
-{
- __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
int cpu, idx = prot->inuse_idx;
@@ -3245,17 +3709,12 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
-static void sock_inuse_add(struct net *net, int val)
-{
- this_cpu_add(*net->core.sock_inuse, val);
-}
-
int sock_inuse_get(struct net *net)
{
int cpu, res = 0;
for_each_possible_cpu(cpu)
- res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
return res;
}
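This works because the old per-netns sock_inuse percpu counter has been folded into struct prot_inuse as an extra field, so a single percpu allocation now serves both. A sketch of the assumed layout (the real definition moved to include/net/sock.h):

/* Assumed consolidated layout (see include/net/sock.h). */
struct prot_inuse {
	int all;			/* every socket, ex *sock_inuse */
	int val[PROTO_INUSE_NR];	/* per-protocol counters */
};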
@@ -3267,22 +3726,12 @@ static int __net_init sock_inuse_init_net(struct net *net)
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
if (net->core.prot_inuse == NULL)
return -ENOMEM;
-
- net->core.sock_inuse = alloc_percpu(int);
- if (net->core.sock_inuse == NULL)
- goto out;
-
return 0;
-
-out:
- free_percpu(net->core.prot_inuse);
- return -ENOMEM;
}
static void __net_exit sock_inuse_exit_net(struct net *net)
{
free_percpu(net->core.prot_inuse);
- free_percpu(net->core.sock_inuse);
}
static struct pernet_operations net_inuse_ops = {
@@ -3328,10 +3777,43 @@ static inline void release_proto_idx(struct proto *prot)
{
}
-static void sock_inuse_add(struct net *net, int val)
+#endif
+
+static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
+ if (!twsk_prot)
+ return;
+ kfree(twsk_prot->twsk_slab_name);
+ twsk_prot->twsk_slab_name = NULL;
+ kmem_cache_destroy(twsk_prot->twsk_slab);
+ twsk_prot->twsk_slab = NULL;
+}
+
+static int tw_prot_init(const struct proto *prot)
+{
+ struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
+
+ if (!twsk_prot)
+ return 0;
+
+ twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
+ prot->name);
+ if (!twsk_prot->twsk_slab_name)
+ return -ENOMEM;
+
+ twsk_prot->twsk_slab =
+ kmem_cache_create(twsk_prot->twsk_slab_name,
+ twsk_prot->twsk_obj_size, 0,
+ SLAB_ACCOUNT | prot->slab_flags,
+ NULL);
+ if (!twsk_prot->twsk_slab) {
+ pr_crit("%s: Can't create timewait sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+
+ return 0;
}
-#endif
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
@@ -3372,6 +3854,14 @@ int proto_register(struct proto *prot, int alloc_slab)
{
int ret = -ENOBUFS;
+ if (prot->memory_allocated && !prot->sysctl_mem) {
+ pr_err("%s: missing sysctl_mem\n", prot->name);
+ return -EINVAL;
+ }
+ if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+ pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+ return -EINVAL;
+ }
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
@@ -3389,37 +3879,23 @@ int proto_register(struct proto *prot, int alloc_slab)
if (req_prot_init(prot))
goto out_free_request_sock_slab;
- if (prot->twsk_prot != NULL) {
- prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
-
- if (prot->twsk_prot->twsk_slab_name == NULL)
- goto out_free_request_sock_slab;
-
- prot->twsk_prot->twsk_slab =
- kmem_cache_create(prot->twsk_prot->twsk_slab_name,
- prot->twsk_prot->twsk_obj_size,
- 0,
- SLAB_ACCOUNT |
- prot->slab_flags,
- NULL);
- if (prot->twsk_prot->twsk_slab == NULL)
- goto out_free_timewait_sock_slab_name;
- }
+ if (tw_prot_init(prot))
+ goto out_free_timewait_sock_slab;
}
mutex_lock(&proto_list_mutex);
ret = assign_proto_idx(prot);
if (ret) {
mutex_unlock(&proto_list_mutex);
- goto out_free_timewait_sock_slab_name;
+ goto out_free_timewait_sock_slab;
}
list_add(&prot->node, &proto_list);
mutex_unlock(&proto_list_mutex);
return ret;
-out_free_timewait_sock_slab_name:
- if (alloc_slab && prot->twsk_prot)
- kfree(prot->twsk_prot->twsk_slab_name);
+out_free_timewait_sock_slab:
+ if (alloc_slab)
+ tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
if (alloc_slab) {
req_prot_cleanup(prot->rsk_prot);
@@ -3443,12 +3919,7 @@ void proto_unregister(struct proto *prot)
prot->slab = NULL;
req_prot_cleanup(prot->rsk_prot);
-
- if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
- kmem_cache_destroy(prot->twsk_prot->twsk_slab);
- kfree(prot->twsk_prot->twsk_slab_name);
- prot->twsk_prot->twsk_slab = NULL;
- }
+ tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
@@ -3465,6 +3936,7 @@ int sock_load_diag_module(int family, int protocol)
#ifdef CONFIG_INET
if (family == AF_INET &&
protocol != IPPROTO_RAW &&
+ protocol < MAX_INET_PROTOS &&
!rcu_access_pointer(inet_protos[protocol]))
return -ENOENT;
#endif
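The added bounds check matters because the protocol number is caller-controlled and indexes a fixed-size table; without it, a crafted value could be used to read past the end of the array before the module lookup even happens. For context, the guarded table as declared in include/net/protocol.h:

/* The table guarded by the new 'protocol < MAX_INET_PROTOS' test. */
extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];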
@@ -3606,3 +4078,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
+{
+ if (!sk->sk_prot->bind_add)
+ return -EOPNOTSUPP;
+ return sk->sk_prot->bind_add(sk, addr, addr_len);
+}
+EXPORT_SYMBOL(sock_bind_add);
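sock_bind_add() is a thin dispatcher: protocols that support binding additional local addresses opt in by filling the new bind_add hook in their struct proto, and everything else gets -EOPNOTSUPP. An illustrative sketch of a protocol wiring it up (SCTP is the expected user; other fields abbreviated):

/* Illustrative opt-in; only the relevant field is shown. */
struct proto sctp_prot = {
	.name	  = "SCTP",
	/* ... */
	.bind_add = sctp_bind_add,	/* bind one more local address */
};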