From eabf425bc6ad32fa49cfb35c7bc59db07dfdd36e Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Tue, 16 Apr 2024 17:53:43 +0800 Subject: neighbour: guarantee the localhost connections be established successfully even the ARP table is full Inter-process communication on localhost should be established successfully even the ARP table is full, many processes on server machine use the localhost to communicate such as command-line interface (CLI), servers hope all CLI commands can be executed successfully even the arp table is full. Right now CLI commands got timeout when the arp table is full. Set the parameter of exempt_from_gc to be true for LOOPBACK net device to keep localhost neigh in arp table, not removed by gc. the steps of reproduced: server with "gc_thresh3 = 1024" setting, ping server from more than 1024 same netmask Lan IPv4 addresses, run "ssh localhost" on console interface, then the command will get timeout. Signed-off-by: Zheng Li Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240416095343.540-1-lizheng043@gmail.com Signed-off-by: Paolo Abeni --- net/core/neighbour.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 552719c3bbc3..47d07b122f7a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -734,7 +734,9 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); + bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + + return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); -- cgit v1.2.3-59-g8ed1b From f8f2eb9de69a1119117d198547c13d7a1123a5a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Apr 2024 09:51:04 +0000 Subject: neighbour: add RCU protection to neigh_tables[] In order to remove RTNL protection from neightbl_dump_info() and neigh_dump_info() later, we need to add RCU protection to neigh_tables[]. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 47d07b122f7a..ccb770539f1a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,7 +1771,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { @@ -1828,13 +1828,19 @@ void neigh_table_init(int index, struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; - neigh_tables[index] = tbl; + rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); +/* + * Only called from ndisc_cleanup(), which means this is dead code + * because we no longer can unload IPv6 module. + */ int neigh_table_clear(int index, struct neigh_table *tbl) { - neigh_tables[index] = NULL; + RCU_INIT_POINTER(neigh_tables[index], NULL); + synchronize_rcu(); + /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); @@ -1866,10 +1872,10 @@ static struct neigh_table *neigh_find_table(int family) switch (family) { case AF_INET: - tbl = neigh_tables[NEIGH_ARP_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: - tbl = neigh_tables[NEIGH_ND_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } @@ -2333,7 +2339,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) @@ -2521,7 +2527,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; @@ -2881,7 +2887,7 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; for (t = 0; t < NEIGH_NR_TABLES; t++) { - tbl = neigh_tables[t]; + tbl = rcu_dereference_rtnl(neigh_tables[t]); if (!tbl) continue; @@ -3145,14 +3151,15 @@ int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; - tbl = neigh_tables[index]; - if (!tbl) - goto out; rcu_read_lock(); + tbl = rcu_dereference(neigh_tables[index]); + if (!tbl) + goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); @@ -3168,6 +3175,7 @@ int neigh_xmit(int index, struct net_device *dev, goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { -- cgit v1.2.3-59-g8ed1b From 7e4975f7e7fb0eba3cbb69d9c467750a1c3ce131 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Apr 2024 09:51:05 +0000 Subject: neighbour: fix neigh_dump_info() return value Change neigh_dump_table() and pneigh_dump_table() to either return 0 or -EMSGSIZE if not enough space was available in the skb. Then neigh_dump_info() can do the same. This allows NLMSG_DONE to be appended to the current skb at the end of a dump, saving a couple of recvmsg() system calls. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ccb770539f1a..d8c3ffdee29f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2715,7 +2715,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct net *net = sock_net(skb->sk); struct neighbour *n; - int rc, h, s_h = cb->args[1]; + int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; @@ -2737,23 +2737,20 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - flags) < 0) { - rc = -1; + err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags); + if (err < 0) goto out; - } next: idx++; } } - rc = skb->len; out: rcu_read_unlock(); cb->args[1] = h; cb->args[2] = idx; - return rc; + return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2762,7 +2759,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); - int rc, h, s_h = cb->args[3]; + int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; @@ -2780,11 +2777,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, flags, tbl) < 0) { + err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl); + if (err < 0) { read_unlock_bh(&tbl->lock); - rc = -1; goto out; } next: @@ -2793,12 +2790,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, } read_unlock_bh(&tbl->lock); - rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; - return rc; - + return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2905,7 +2900,7 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) } cb->args[0] = t; - return skb->len; + return err; } static int neigh_valid_get_req(const struct nlmsghdr *nlh, -- cgit v1.2.3-59-g8ed1b From ba0f780694237d96a1d6366f931cd716fb0f7ab5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Apr 2024 09:51:06 +0000 Subject: neighbour: no longer hold RTNL in neigh_dump_info() neigh_dump_table() is already relying on RCU protection. pneigh_dump_table() is using its own protection (tbl->lock) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d8c3ffdee29f..0805c00c63d4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2723,7 +2723,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { @@ -2747,7 +2746,6 @@ next: } } out: - rcu_read_unlock(); cb->args[1] = h; cb->args[2] = idx; return err; @@ -2881,8 +2879,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; + rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { - tbl = rcu_dereference_rtnl(neigh_tables[t]); + tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; @@ -2898,6 +2897,7 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } + rcu_read_unlock(); cb->args[0] = t; return err; @@ -3894,7 +3894,8 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, + RTNL_FLAG_DUMP_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); -- cgit v1.2.3-59-g8ed1b From 1c04b46cbdddc7882eeb671521035ea884245b9f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 21 Apr 2024 18:57:53 +0000 Subject: neighbour: fix neigh_master_filtered() If we no longer hold RTNL, we must use netdev_master_upper_dev_get_rcu() instead of netdev_master_upper_dev_get(). Fixes: ba0f78069423 ("neighbour: no longer hold RTNL in neigh_dump_info()") Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240421185753.1808077-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 0805c00c63d4..af270c202d9a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2682,7 +2682,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = dev ? netdev_master_upper_dev_get(dev) : NULL; + master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". -- cgit v1.2.3-59-g8ed1b From ce218712b0f6f97f9e838634548044b970eeca63 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 1 May 2024 11:29:25 +0200 Subject: net: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) * Remove sentinel element from ctl_table structs. * Remove the zeroing out of an array element (to make it look like a sentinel) in neigh_sysctl_register and lowpan_frags_ns_sysctl_register This is not longer needed and is safe after commit c899710fe7f9 ("networking: Update to register_net_sysctl_sz") added the array size to the ctl_table registration. * Replace the for loop stop condition in sysctl_core_net_init that tests for procname == NULL with one that depends on array size * Removed the "-1" in mpls_net_init that adjusted for having an extra empty element when looping over ctl_table arrays * Use a table_size variable to keep the value of ARRAY_SIZE Signed-off-by: Joel Granados Signed-off-by: David S. Miller --- net/core/neighbour.c | 5 +---- net/core/sysctl_net_core.c | 13 ++++++------- net/dccp/sysctl.c | 2 -- net/ieee802154/6lowpan/reassembly.c | 6 +----- net/mpls/af_mpls.c | 13 ++++++------- net/unix/sysctl_net_unix.c | 1 - 6 files changed, 14 insertions(+), 26 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index af270c202d9a..45fd88405b6b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3733,7 +3733,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3784,7 +3784,6 @@ static struct neigh_sysctl_table { .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, - {}, }, }; @@ -3812,8 +3811,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6da5995ac86a..c9fb9ad87485 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -661,7 +661,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { } }; static struct ctl_table netns_core_table[] = { @@ -698,7 +697,6 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -716,20 +714,21 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + size_t table_size = ARRAY_SIZE(netns_core_table); + struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (i = 0; i < table_size; ++i) + tbl[i].data += (char *)net - (char *)&init_net; } - net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, - ARRAY_SIZE(netns_core_table)); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index ee8d4f5afa72..3fc474d6e57d 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -90,8 +90,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - - { } }; static struct ctl_table_header *dccp_table_header; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2a983cf450da..56ef873828f4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -338,7 +338,6 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -351,7 +350,6 @@ static struct ctl_table lowpan_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) @@ -370,10 +368,8 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) table_size = 0; - } } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 5d2012d1cf4a..2dc7a908a6bb 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1377,13 +1377,13 @@ static const struct ctl_table mpls_dev_table[] = { .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, - { } }; static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + size_t table_size = ARRAY_SIZE(mpls_dev_table); struct net *net = dev_net(dev); struct ctl_table *table; int i; @@ -1395,7 +1395,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) { + for (i = 0; i < table_size; i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; table[i].extra1 = mdev; table[i].extra2 = net; @@ -1403,8 +1403,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); - mdev->sysctl = register_net_sysctl_sz(net, path, table, - ARRAY_SIZE(mpls_dev_table)); + mdev->sysctl = register_net_sysctl_sz(net, path, table, table_size); if (!mdev->sysctl) goto free; @@ -2653,11 +2652,11 @@ static const struct ctl_table mpls_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, - { } }; static int mpls_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(mpls_table); struct ctl_table *table; int i; @@ -2673,11 +2672,11 @@ static int mpls_net_init(struct net *net) /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) + for (i = 0; i < table_size; i++) table[i].data = (char *)net + (uintptr_t)table[i].data; net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, - ARRAY_SIZE(mpls_table)); + table_size); if (net->mpls.ctl == NULL) { kfree(table); return -ENOMEM; diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 44996af61999..357b3e5f3847 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -19,7 +19,6 @@ static struct ctl_table unix_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { } }; int __net_init unix_sysctl_register(struct net *net) -- cgit v1.2.3-59-g8ed1b