aboutsummaryrefslogtreecommitdiffstats
path: root/net/core/sock.c
diff options
context:
space:
mode:
author: Wei Wang <weiwan@google.com> 2021-09-29 10:25:11 -0700
committer: David S. Miller <davem@davemloft.net> 2021-09-30 13:36:46 +0100
commit: 2bb2f5fb21b0486ff69b7b4a1fe03a760527d133 (patch)
tree: 1e1b3e094fd135390997b383ccdd38a85eecbfe0 /net/core/sock.c
parent: net: phy: marvell10g: add downshift tunable support (diff)
downloadlinux-dev-2bb2f5fb21b0486ff69b7b4a1fe03a760527d133.tar.xz
linux-dev-2bb2f5fb21b0486ff69b7b4a1fe03a760527d133.zip
net: add new socket option SO_RESERVE_MEM
This socket option provides a mechanism for users to reserve a certain amount of memory for the socket to use. When this option is set, the kernel charges the user-specified amount of memory to memcg, as well as to sk_forward_alloc. This amount of memory is not reclaimable and is available in sk_forward_alloc for this socket. With this socket option set, the networking stack spends fewer cycles doing forward alloc and reclaim, which should lead to better system performance, at the cost of an amount of pre-allocated and unreclaimable memory, even under memory pressure. Note: This socket option is only available when memory cgroup is enabled, and we require this reserved memory to be charged to the user's memcg. We hope this prevents misbehaving users from abusing this feature to reserve a large amount on certain sockets and cause unfairness for others. Signed-off-by: Wei Wang <weiwan@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core/sock.c')
-rw-r--r--net/core/sock.c69
1 files changed, 69 insertions, 0 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f9780..0ecb8590e043 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -947,6 +947,53 @@ void sock_set_mark(struct sock *sk, u32 val)
}
EXPORT_SYMBOL(sock_set_mark);
+static void sock_release_reserved_memory(struct sock *sk, int bytes)
+{
+ /* Shrink the reservation by whole SK_MEM_QUANTUM units only: round bytes down (mask assumes a power-of-two quantum) */
+ bytes &= ~(SK_MEM_QUANTUM - 1);
+
+ WARN_ON(bytes > sk->sk_reserved_mem); /* asked to release more than is currently reserved */
+ sk->sk_reserved_mem -= bytes;
+ sk_mem_reclaim(sk); /* NOTE(review): presumably gives back the no-longer-reserved forward alloc; confirm in sk_mem_reclaim() */
+}
+
+static int sock_reserve_memory(struct sock *sk, int bytes)
+{
+ long allocated; /* value returned by sk_memory_allocated_add() */
+ bool charged; /* whether the memcg charge succeeded */
+ int pages; /* bytes converted to a page count by sk_mem_pages() */
+
+ if (!mem_cgroup_sockets_enabled || !sk->sk_memcg)
+ return -EOPNOTSUPP; /* the reservation must be charged to a memcg and none is usable */
+
+ if (!bytes)
+ return 0; /* nothing to reserve */
+
+ pages = sk_mem_pages(bytes);
+
+ /* Step 1: pre-charge the pages to the socket's memcg; bail out with -ENOMEM if refused */
+ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!charged)
+ return -ENOMEM;
+
+ /* Step 2: account the pages via sk_memory_allocated_add() before crediting forward_alloc */
+ allocated = sk_memory_allocated_add(sk, pages);
+ /* If the system goes into memory pressure with this precharge
+ * (allocated exceeds sk_prot_mem_limits(sk, 1)), undo both
+ * charges made above, give up and return an error.
+ */
+ if (allocated > sk_prot_mem_limits(sk, 1)) {
+ sk_memory_allocated_sub(sk, pages);
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
+ return -ENOMEM;
+ }
+ sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; /* credit the socket with the pre-charged amount */
+
+ sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; /* record the reserved amount, page-rounded so it may exceed bytes */
+
+ return 0;
+}
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
@@ -1367,6 +1414,23 @@ set_sndbuf:
~SOCK_BUF_LOCK_MASK);
break;
+ case SO_RESERVE_MEM:
+ {
+ int delta;
+
+ if (val < 0) {
+ ret = -EINVAL;
+ break;
+ }
+
+ delta = val - sk->sk_reserved_mem;
+ if (delta < 0)
+ sock_release_reserved_memory(sk, -delta);
+ else
+ ret = sock_reserve_memory(sk, delta);
+ break;
+ }
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1733,6 +1797,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
break;
+ case SO_RESERVE_MEM:
+ v.val = sk->sk_reserved_mem;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2045,6 +2113,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
+ newsk->sk_reserved_mem = 0;
atomic_set(&newsk->sk_drops, 0);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;