aboutsummaryrefslogtreecommitdiffstats
path: root/net/core/sock.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--net/core/sock.c406
1 files changed, 274 insertions, 132 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index 41e91d0f7061..a3ba0358c77c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -141,10 +141,13 @@
#include <linux/ethtool.h>
+#include "dev.h"
+
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
-static void sock_inuse_add(struct net *net, int val);
+static void sock_def_write_space_wfree(struct sock *sk);
+static void sock_def_write_space(struct sock *sk);
/**
* sk_ns_capable - General socket capability test
@@ -327,7 +330,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
noreclaim_flag = memalloc_noreclaim_save();
- ret = sk->sk_backlog_rcv(sk, skb);
+ ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+ tcp_v6_do_rcv,
+ tcp_v4_do_rcv,
+ sk, skb);
memalloc_noreclaim_restore(noreclaim_flag);
return ret;
@@ -502,17 +508,35 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
{
+ enum skb_drop_reason drop_reason;
int err;
err = sk_filter(sk, skb);
- if (err)
- return err;
-
- return __sock_queue_rcv_skb(sk, skb);
+ if (err) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+ goto out;
+ }
+ err = __sock_queue_rcv_skb(sk, skb);
+ switch (err) {
+ case -ENOMEM:
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ break;
+ case -ENOBUFS:
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ break;
+ default:
+ drop_reason = SKB_NOT_DROPPED_YET;
+ break;
+ }
+out:
+ if (reason)
+ *reason = drop_reason;
+ return err;
}
-EXPORT_SYMBOL(sock_queue_rcv_skb);
+EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
const int nested, unsigned int trim_cap, bool refcounted)
@@ -611,7 +635,9 @@ static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
if (ifindex < 0)
goto out;
- sk->sk_bound_dev_if = ifindex;
+ /* Paired with all READ_ONCE() done locklessly. */
+ WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
+
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
@@ -677,22 +703,25 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
goto out;
}
- return sock_bindtoindex(sk, index, true);
+ sockopt_lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, index);
+ sockopt_release_sock(sk);
out:
#endif
return ret;
}
-static int sock_getbindtodevice(struct sock *sk, char __user *optval,
- int __user *optlen, int len)
+static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
struct net *net = sock_net(sk);
char devname[IFNAMSIZ];
- if (sk->sk_bound_dev_if == 0) {
+ if (bound_dev_if == 0) {
len = 0;
goto zero;
}
@@ -701,19 +730,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval,
if (len < IFNAMSIZ)
goto out;
- ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ ret = netdev_get_name(net, devname, bound_dev_if);
if (ret)
goto out;
len = strlen(devname) + 1;
ret = -EFAULT;
- if (copy_to_user(optval, devname, len))
+ if (copy_to_sockptr(optval, devname, len))
goto out;
zero:
ret = -EFAULT;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
goto out;
ret = 0;
@@ -843,6 +872,8 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
}
num = ethtool_get_phc_vclocks(dev, &vclock_index);
+ dev_put(dev);
+
for (i = 0; i < num; i++) {
if (*(vclock_index + i) == phc_index) {
match = true;
@@ -872,14 +903,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
+ if (sk_is_tcp(sk)) {
if ((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN))
return -EINVAL;
- sk->sk_tskey = tcp_sk(sk)->snd_una;
+ atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
} else {
- sk->sk_tskey = 0;
+ atomic_set(&sk->sk_tskey, 0);
}
}
@@ -963,7 +993,7 @@ EXPORT_SYMBOL(sock_set_mark);
static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
/* Round down bytes to multiple of pages */
- bytes &= ~(SK_MEM_QUANTUM - 1);
+ bytes = round_down(bytes, PAGE_SIZE);
WARN_ON(bytes > sk->sk_reserved_mem);
sk->sk_reserved_mem -= bytes;
@@ -991,7 +1021,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
return -ENOMEM;
/* pre-charge to forward_alloc */
- allocated = sk_memory_allocated_add(sk, pages);
+ sk_memory_allocated_add(sk, pages);
+ allocated = sk_memory_allocated(sk);
/* If the system goes into memory pressure with this
* precharge, give up and return error.
*/
@@ -1000,24 +1031,58 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
return -ENOMEM;
}
- sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc += pages << PAGE_SHIFT;
- sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_reserved_mem += pages << PAGE_SHIFT;
return 0;
}
+void sockopt_lock_sock(struct sock *sk)
+{
+ /* When current->bpf_ctx is set, the setsockopt is called from
+ * a bpf prog. bpf has ensured the sk lock has been
+ * acquired before calling setsockopt().
+ */
+ if (has_current_bpf_ctx())
+ return;
+
+ lock_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_lock_sock);
+
+void sockopt_release_sock(struct sock *sk)
+{
+ if (has_current_bpf_ctx())
+ return;
+
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_release_sock);
+
+bool sockopt_ns_capable(struct user_namespace *ns, int cap)
+{
+ return has_current_bpf_ctx() || ns_capable(ns, cap);
+}
+EXPORT_SYMBOL(sockopt_ns_capable);
+
+bool sockopt_capable(int cap)
+{
+ return has_current_bpf_ctx() || capable(cap);
+}
+EXPORT_SYMBOL(sockopt_capable);
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
-int sock_setsockopt(struct socket *sock, int level, int optname,
- sockptr_t optval, unsigned int optlen)
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct so_timestamping timestamping;
+ struct socket *sock = sk->sk_socket;
struct sock_txtime sk_txtime;
- struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
@@ -1038,11 +1103,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
valbool = val ? 1 : 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case SO_DEBUG:
- if (val && !capable(CAP_NET_ADMIN))
+ if (val && !sockopt_capable(CAP_NET_ADMIN))
ret = -EACCES;
else
sock_valbool_flag(sk, SOCK_DBG, valbool);
@@ -1072,7 +1137,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_wmem_max);
+ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
/* Ensure val * 2 fits into an int, to prevent max_t()
* from treating it as a negative value.
@@ -1086,7 +1151,7 @@ set_sndbuf:
break;
case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1104,11 +1169,11 @@ set_sndbuf:
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
+ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
break;
case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1135,7 +1200,8 @@ set_sndbuf:
case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
ret = -EPERM;
@@ -1198,7 +1264,7 @@ set_sndbuf:
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- if (sock->ops->set_rcvlowat)
+ if (sock && sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
@@ -1280,13 +1346,23 @@ set_sndbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
__sock_set_mark(sk, val);
break;
+ case SO_RCVMARK:
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+
+ sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
+ break;
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
@@ -1314,7 +1390,7 @@ set_sndbuf:
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
/* allow unprivileged users to decrease the value */
- if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else {
if (val < 0)
@@ -1324,13 +1400,13 @@ set_sndbuf:
}
break;
case SO_PREFER_BUSY_POLL:
- if (valbool && !capable(CAP_NET_ADMIN))
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
break;
case SO_BUSY_POLL_BUDGET:
- if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
} else {
if (val < 0 || val > U16_MAX)
@@ -1370,13 +1446,12 @@ set_sndbuf:
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (!((sk->sk_type == SOCK_STREAM &&
- sk->sk_protocol == IPPROTO_TCP) ||
+ if (!(sk_is_tcp(sk) ||
(sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP)))
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
} else if (sk->sk_family != PF_RDS) {
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
}
if (!ret) {
if (val < 0 || val > 1)
@@ -1402,7 +1477,7 @@ set_sndbuf:
* scheduler has enough safe guards.
*/
if (sk_txtime.clockid != CLOCK_MONOTONIC &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1444,13 +1519,29 @@ set_sndbuf:
break;
}
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Paired with READ_ONCE() in tcp_rtx_synack() */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return sk_setsockopt(sock->sk, level, optname,
+ optval, optlen);
+}
EXPORT_SYMBOL(sock_setsockopt);
static const struct cred *sk_get_peer_cred(struct sock *sk)
@@ -1477,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
-static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
struct user_namespace *user_ns = current_user_ns();
int i;
- for (i = 0; i < src->ngroups; i++)
- if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ for (i = 0; i < src->ngroups; i++) {
+ gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
+
+ if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
return -EFAULT;
+ }
return 0;
}
-int sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
- struct sock *sk = sock->sk;
+ struct socket *sock = sk->sk_socket;
union {
int val;
@@ -1509,7 +1603,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
int lv = sizeof(int);
int len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
@@ -1644,7 +1738,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
spin_unlock(&sk->sk_peer_lock);
- if (copy_to_user(optval, &peercred, len))
+ if (copy_to_sockptr(optval, &peercred, len))
return -EFAULT;
goto lenout;
}
@@ -1662,11 +1756,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
put_cred(cred);
- return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
- ret = groups_to_user((gid_t __user *)optval, cred->group_info);
+ ret = groups_to_user(optval, cred->group_info);
put_cred(cred);
if (ret)
return ret;
@@ -1682,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return -ENOTCONN;
if (lv < len)
return -EINVAL;
- if (copy_to_user(optval, address, len))
+ if (copy_to_sockptr(optval, address, len))
return -EFAULT;
goto lenout;
}
@@ -1699,12 +1793,16 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PEERSEC:
- return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len);
case SO_MARK:
v.val = sk->sk_mark;
break;
+ case SO_RCVMARK:
+ v.val = sock_flag(sk, SOCK_RCVMARK);
+ break;
+
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL);
break;
@@ -1727,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return sock_getbindtodevice(sk, optval, optlen, len);
case SO_GET_FILTER:
- len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ len = sk_get_filter(sk, optval, len);
if (len < 0)
return len;
@@ -1775,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
sk_get_meminfo(sk, meminfo);
len = min_t(unsigned int, len, sizeof(meminfo));
- if (copy_to_user(optval, &meminfo, len))
+ if (copy_to_sockptr(optval, &meminfo, len))
return -EFAULT;
goto lenout;
@@ -1813,7 +1911,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BINDTOIFINDEX:
- v.val = sk->sk_bound_dev_if;
+ v.val = READ_ONCE(sk->sk_bound_dev_if);
break;
case SO_NETNS_COOKIE:
@@ -1831,6 +1929,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_reserved_mem;
break;
+ case SO_TXREHASH:
+ v.val = sk->sk_txrehash;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -1840,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len > lv)
len = lv;
- if (copy_to_user(optval, &v, len))
+ if (copy_to_sockptr(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return sk_getsockopt(sock->sk, level, optname,
+ USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
+}
+
/*
* Initialize an sk_lock.
*
@@ -1982,7 +2092,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt)) {
- get_net(net);
+ get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
}
@@ -2038,7 +2148,7 @@ static void __sk_destruct(struct rcu_head *head)
put_pid(sk->sk_peer_pid);
if (likely(sk->sk_net_refcnt))
- put_net(sock_net(sk));
+ put_net_track(sock_net(sk), &sk->ns_tracker);
sk_prot_free(sk->sk_prot_creator, sk);
}
@@ -2125,7 +2235,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
/* SANITY */
if (likely(newsk->sk_net_refcnt)) {
- get_net(sock_net(newsk));
+ get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
sock_inuse_add(sock_net(newsk), 1);
}
sk_node_init(&newsk->sk_node);
@@ -2241,22 +2351,42 @@ void sk_free_unlock_clone(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+static void sk_trim_gso_size(struct sock *sk)
+{
+ if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
+ return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ sk_is_tcp(sk) &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return;
+#endif
+ sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
+}
+
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
sk_dst_set(sk, dst);
- sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
+ sk->sk_route_caps = dst->dev->features;
+ if (sk_is_tcp(sk))
+ sk->sk_route_caps |= NETIF_F_GSO;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
- sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (unlikely(sk->sk_gso_disabled))
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
if (sk_can_gso(sk)) {
if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = dst->dev->gso_max_size;
- max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
+ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+ sk_trim_gso_size(sk);
+ sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
+ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
@@ -2275,8 +2405,20 @@ void sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
unsigned int len = skb->truesize;
+ bool free;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ if (sock_flag(sk, SOCK_RCU_FREE) &&
+ sk->sk_write_space == sock_def_write_space) {
+ rcu_read_lock();
+ free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
+ sock_def_write_space_wfree(sk);
+ rcu_read_unlock();
+ if (unlikely(free))
+ __sk_free(sk);
+ return;
+ }
+
/*
* Keep a reference on sk_wmem_alloc, this will be released
* after sk_write_space() call
@@ -2448,7 +2590,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
- sysctl_optmem_max)
+ READ_ONCE(sysctl_optmem_max))
return NULL;
skb = alloc_skb(size, priority);
@@ -2466,8 +2608,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned int)size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
+
+ if ((unsigned int)size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
* might sleep.
@@ -2586,13 +2730,6 @@ failure:
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
- int noblock, int *errcode)
-{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
-}
-EXPORT_SYMBOL(sock_alloc_send_skb);
-
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
struct sockcm_cookie *sockc)
{
@@ -2600,7 +2737,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
switch (cmsg->cmsg_type) {
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
@@ -2763,7 +2901,7 @@ void __release_sock(struct sock *sk)
do {
next = skb->next;
prefetch(next);
- WARN_ON_ONCE(skb_dst_is_noref(skb));
+ DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
@@ -2788,6 +2926,7 @@ void __sk_flush_backlog(struct sock *sk)
__release_sock(sk);
spin_unlock_bh(&sk->sk_lock.slock);
}
+EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
@@ -2825,11 +2964,13 @@ EXPORT_SYMBOL(sk_wait_data);
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
- struct proto *prot = sk->sk_prot;
- long allocated = sk_memory_allocated_add(sk, amt);
bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
+ struct proto *prot = sk->sk_prot;
bool charged = true;
+ long allocated;
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
if (memcg_charge &&
!(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
gfp_memcg_charge())))
@@ -2906,7 +3047,6 @@ suppress_allocation:
return 0;
}
-EXPORT_SYMBOL(__sk_mem_raise_allocated);
/**
* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
@@ -2922,10 +3062,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
int ret, amt = sk_mem_pages(size);
- sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc += amt << PAGE_SHIFT;
ret = __sk_mem_raise_allocated(sk, size, amt, kind);
if (!ret)
- sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amt << PAGE_SHIFT;
return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2948,17 +3088,16 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
-EXPORT_SYMBOL(__sk_mem_reduce_allocated);
/**
* __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
*/
void __sk_mem_reclaim(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ amount >>= PAGE_SHIFT;
+ sk->sk_forward_alloc -= amount << PAGE_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -3148,20 +3287,42 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
+ if (sock_writeable(sk)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
}
+/* An optimised version of sock_def_write_space(), should only be called
+ * for SOCK_RCU_FREE sockets under RCU read section and after putting
+ * ->sk_wmem_alloc.
+ */
+static void sock_def_write_space_wfree(struct sock *sk)
+{
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if (sock_writeable(sk)) {
+ struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+ /* rely on refcount_sub from sock_wfree() */
+ smp_mb__after_atomic();
+ if (wq && waitqueue_active(&wq->wait))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+
+ /* Should agree with poll, otherwise some programs break */
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+}
+
static void sock_def_destruct(struct sock *sk)
{
}
@@ -3204,8 +3365,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
timer_setup(&sk->sk_timer, NULL, 0);
sk->sk_allocation = GFP_KERNEL;
- sk->sk_rcvbuf = sysctl_rmem_default;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
@@ -3260,13 +3421,14 @@ void sock_init_data(struct socket *sock, struct sock *sk)
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
- sk->sk_ll_usec = sysctl_net_busy_read;
+ sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif
sk->sk_max_pacing_rate = ~0UL;
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
+ sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
@@ -3286,7 +3448,7 @@ void lock_sock_nested(struct sock *sk, int subclass)
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (sk->sk_lock.owned)
+ if (sock_owned_by_user_nocheck(sk))
__lock_sock(sk);
sk->sk_lock.owned = 1;
spin_unlock_bh(&sk->sk_lock.slock);
@@ -3317,7 +3479,7 @@ bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (!sk->sk_lock.owned) {
+ if (!sock_owned_by_user_nocheck(sk)) {
/*
* Fast path return with bottom halves disabled and
* sock::sk_lock.slock held.
@@ -3448,7 +3610,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
@@ -3459,8 +3622,7 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int addr_len = 0;
int err;
- err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
@@ -3475,7 +3637,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
@@ -3532,19 +3695,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
}
#ifdef CONFIG_PROC_FS
-#define PROTO_INUSE_NR 64 /* should be enough for the first time */
-struct prot_inuse {
- int val[PROTO_INUSE_NR];
-};
-
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
-{
- __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
int cpu, idx = prot->inuse_idx;
@@ -3557,17 +3709,12 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
-static void sock_inuse_add(struct net *net, int val)
-{
- this_cpu_add(*net->core.sock_inuse, val);
-}
-
int sock_inuse_get(struct net *net)
{
int cpu, res = 0;
for_each_possible_cpu(cpu)
- res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
return res;
}
@@ -3579,22 +3726,12 @@ static int __net_init sock_inuse_init_net(struct net *net)
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
if (net->core.prot_inuse == NULL)
return -ENOMEM;
-
- net->core.sock_inuse = alloc_percpu(int);
- if (net->core.sock_inuse == NULL)
- goto out;
-
return 0;
-
-out:
- free_percpu(net->core.prot_inuse);
- return -ENOMEM;
}
static void __net_exit sock_inuse_exit_net(struct net *net)
{
free_percpu(net->core.prot_inuse);
- free_percpu(net->core.sock_inuse);
}
static struct pernet_operations net_inuse_ops = {
@@ -3640,9 +3777,6 @@ static inline void release_proto_idx(struct proto *prot)
{
}
-static void sock_inuse_add(struct net *net, int val)
-{
-}
#endif
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
@@ -3720,6 +3854,14 @@ int proto_register(struct proto *prot, int alloc_slab)
{
int ret = -ENOBUFS;
+ if (prot->memory_allocated && !prot->sysctl_mem) {
+ pr_err("%s: missing sysctl_mem\n", prot->name);
+ return -EINVAL;
+ }
+ if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+ pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+ return -EINVAL;
+ }
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,