aboutsummaryrefslogtreecommitdiffstats
path: root/net/core/sock_reuseport.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-15 15:04:25 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-15 15:04:25 -0700
commit9a76aba02a37718242d7cdc294f0a3901928aa57 (patch)
tree2040d038f85d2120f21af83b0793efd5af1864e3 /net/core/sock_reuseport.c
parentx86: i8259: Add missing include file (diff)
parentbpf: test: fix spelling mistake "REUSEEPORT" -> "REUSEPORT" (diff)
downloadlinux-dev-9a76aba02a37718242d7cdc294f0a3901928aa57.tar.xz
linux-dev-9a76aba02a37718242d7cdc294f0a3901928aa57.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: - Gustavo A. R. Silva keeps working on the implicit switch fallthru changes. - Support 802.11ax High-Efficiency wireless in cfg80211 et al, From Luca Coelho. - Re-enable ASPM in r8169, from Kai-Heng Feng. - Add virtual XFRM interfaces, which avoids all of the limitations of existing IPSEC tunnels. From Steffen Klassert. - Convert GRO over to use a hash table, so that when we have many flows active we don't traverse a long list during accumluation. - Many new self tests for routing, TC, tunnels, etc. Too many contributors to mention them all, but I'm really happy to keep seeing this stuff. - Hardware timestamping support for dpaa_eth/fsl-fman from Yangbo Lu. - Lots of cleanups and fixes in L2TP code from Guillaume Nault. - Add IPSEC offload support to netdevsim, from Shannon Nelson. - Add support for slotting with non-uniform distribution to netem packet scheduler, from Yousuk Seung. - Add UDP GSO support to mlx5e, from Boris Pismenny. - Support offloading of Team LAG in NFP, from John Hurley. - Allow to configure TX queue selection based upon RX queue, from Amritha Nambiar. - Support ethtool ring size configuration in aquantia, from Anton Mikaev. - Support DSCP and flowlabel per-transport in SCTP, from Xin Long. - Support list based batching and stack traversal of SKBs, this is very exciting work. From Edward Cree. - Busyloop optimizations in vhost_net, from Toshiaki Makita. - Introduce the ETF qdisc, which allows time based transmissions. IGB can offload this in hardware. From Vinicius Costa Gomes. - Add parameter support to devlink, from Moshe Shemesh. - Several multiplication and division optimizations for BPF JIT in nfp driver, from Jiong Wang. - Lots of prepatory work to make more of the packet scheduler layer lockless, when possible, from Vlad Buslov. - Add ACK filter and NAT awareness to sch_cake packet scheduler, from Toke Høiland-Jørgensen. - Support regions and region snapshots in devlink, from Alex Vesker. - Allow to attach XDP programs to both HW and SW at the same time on a given device, with initial support in nfp. From Jakub Kicinski. - Add TLS RX offload and support in mlx5, from Ilya Lesokhin. - Use PHYLIB in r8169 driver, from Heiner Kallweit. - All sorts of changes to support Spectrum 2 in mlxsw driver, from Ido Schimmel. - PTP support in mv88e6xxx DSA driver, from Andrew Lunn. - Make TCP_USER_TIMEOUT socket option more accurate, from Jon Maxwell. - Support for templates in packet scheduler classifier, from Jiri Pirko. - IPV6 support in RDS, from Ka-Cheong Poon. - Native tproxy support in nf_tables, from Máté Eckl. - Maintain IP fragment queue in an rbtree, but optimize properly for in-order frags. From Peter Oskolkov. - Improvde handling of ACKs on hole repairs, from Yuchung Cheng" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1996 commits) bpf: test: fix spelling mistake "REUSEEPORT" -> "REUSEPORT" hv/netvsc: Fix NULL dereference at single queue mode fallback net: filter: mark expected switch fall-through xen-netfront: fix warn message as irq device name has '/' cxgb4: Add new T5 PCI device ids 0x50af and 0x50b0 net: dsa: mv88e6xxx: missing unlock on error path rds: fix building with IPV6=m inet/connection_sock: prefer _THIS_IP_ to current_text_addr net: dsa: mv88e6xxx: bitwise vs logical bug net: sock_diag: Fix spectre v1 gadget in __sock_diag_cmd() ieee802154: hwsim: using right kind of iteration net: hns3: Add vlan filter setting by ethtool command -K net: hns3: Set tx ring' tc info when netdev is up net: hns3: Remove tx ring BD len register in hns3_enet net: hns3: Fix desc num set to default when setting channel net: hns3: Fix for phy link issue when using marvell phy driver net: hns3: Fix for information of phydev lost problem when down/up net: hns3: Fix for command format parsing error in hclge_is_all_function_id_zero net: hns3: Add support for serdes loopback selftest bnxt_en: take coredump_record structure off stack ...
Diffstat (limited to 'net/core/sock_reuseport.c')
-rw-r--r--net/core/sock_reuseport.c92
1 files changed, 76 insertions, 16 deletions
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 064acb04be0f..ba5cba56f574 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -8,11 +8,34 @@
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
+#include <linux/idr.h>
+#include <linux/filter.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
-static DEFINE_SPINLOCK(reuseport_lock);
+DEFINE_SPINLOCK(reuseport_lock);
+
+#define REUSEPORT_MIN_ID 1
+static DEFINE_IDA(reuseport_ida);
+
+int reuseport_get_id(struct sock_reuseport *reuse)
+{
+ int id;
+
+ if (reuse->reuseport_id)
+ return reuse->reuseport_id;
+
+ id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
+ /* Called under reuseport_lock */
+ GFP_ATOMIC);
+ if (id < 0)
+ return id;
+
+ reuse->reuseport_id = id;
+
+ return reuse->reuseport_id;
+}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
@@ -29,7 +52,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
return reuse;
}
-int reuseport_alloc(struct sock *sk)
+int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
@@ -41,9 +64,17 @@ int reuseport_alloc(struct sock *sk)
/* Allocation attempts can occur concurrently via the setsockopt path
* and the bind/hash path. Nothing to do when we lose the race.
*/
- if (rcu_dereference_protected(sk->sk_reuseport_cb,
- lockdep_is_held(&reuseport_lock)))
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (reuse) {
+ /* Only set reuse->bind_inany if the bind_inany is true.
+ * Otherwise, it will overwrite the reuse->bind_inany
+ * which was set by the bind/hash path.
+ */
+ if (bind_inany)
+ reuse->bind_inany = bind_inany;
goto out;
+ }
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
@@ -53,6 +84,7 @@ int reuseport_alloc(struct sock *sk)
reuse->socks[0] = sk;
reuse->num_socks = 1;
+ reuse->bind_inany = bind_inany;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
@@ -78,9 +110,12 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
+ more_reuse->reuseport_id = reuse->reuseport_id;
+ more_reuse->bind_inany = reuse->bind_inany;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
+ more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
for (i = 0; i < reuse->num_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
@@ -99,8 +134,9 @@ static void reuseport_free_rcu(struct rcu_head *head)
struct sock_reuseport *reuse;
reuse = container_of(head, struct sock_reuseport, rcu);
- if (reuse->prog)
- bpf_prog_destroy(reuse->prog);
+ sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
+ if (reuse->reuseport_id)
+ ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
@@ -110,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head)
* @sk2: Socket belonging to the existing reuseport group.
* May return ENOMEM and not add socket to group under memory pressure.
*/
-int reuseport_add_sock(struct sock *sk, struct sock *sk2)
+int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
struct sock_reuseport *old_reuse, *reuse;
if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
- int err = reuseport_alloc(sk2);
+ int err = reuseport_alloc(sk2, bind_inany);
if (err)
return err;
@@ -160,6 +196,14 @@ void reuseport_detach_sock(struct sock *sk)
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
+
+ /* At least one of the sk in this reuseport group is added to
+ * a bpf map. Notify the bpf side. The bpf map logic will
+ * remove the sk if it is indeed added to a bpf map.
+ */
+ if (reuse->reuseport_id)
+ bpf_sk_reuseport_detach(sk);
+
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
for (i = 0; i < reuse->num_socks; i++) {
@@ -175,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk)
}
EXPORT_SYMBOL(reuseport_detach_sock);
-static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
- struct bpf_prog *prog, struct sk_buff *skb,
- int hdr_len)
+static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
{
struct sk_buff *nskb = NULL;
u32 index;
@@ -238,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk,
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
- if (prog && skb)
- sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ if (!prog || !skb)
+ goto select_by_hash;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ else
+ sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
+select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
if (!sk2)
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
@@ -252,12 +302,21 @@ out:
}
EXPORT_SYMBOL(reuseport_select_sock);
-struct bpf_prog *
-reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
+ if (sk_unhashed(sk) && sk->sk_reuseport) {
+ int err = reuseport_alloc(sk, false);
+
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
@@ -266,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
rcu_assign_pointer(reuse->prog, prog);
spin_unlock_bh(&reuseport_lock);
- return old_prog;
+ sk_reuseport_prog_free(old_prog);
+ return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);