From 109582af18b9aade9385ea6609a792f80a7d70ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:47 +0200 Subject: netns: returns always an id in __peernet2id() All callers of this function expect a nsid, not an error. Thus, returns NETNSA_NSID_NOT_ASSIGNED in case of error so that callers don't have to convert the error to NETNSA_NSID_NOT_ASSIGNED. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 78fc04ad36fc..294d38742e2a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -192,10 +192,12 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) if (id > 0) return id; - if (alloc) - return alloc_netid(net, peer, -1); + if (alloc) { + id = alloc_netid(net, peer, -1); + return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + } - return -ENOENT; + return NETNSA_NSID_NOT_ASSIGNED; } /* This function returns the id of a peer netns. If no id is assigned, one will @@ -204,10 +206,8 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) int peernet2id(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; - int id; - id = __peernet2id(net, peer, alloc); - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + return __peernet2id(net, peer, alloc); } EXPORT_SYMBOL(peernet2id); @@ -554,13 +554,10 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nsid >= 0) { + if (nsid >= 0) id = nsid; - } else { + else id = __peernet2id(net, peer, false); - if (id < 0) - id = NETNSA_NSID_NOT_ASSIGNED; - } if (nla_put_s32(skb, NETNSA_NSID, id)) goto nla_put_failure; -- cgit v1.2.3-59-g8ed1b From cab3c8ec8d57ef48ed754ee7acf2b9bdce80fa5f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:48 +0200 Subject: netns: always provide the id to rtnl_net_fill() The goal of this commit is to prepare the rework of the locking of nsnid protection. After this patch, rtnl_net_notifyid() will not call anymore __peernet2id(), ie no idr_* operation into this function. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 294d38742e2a..37c68bb72db3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -147,8 +147,7 @@ static void ops_free_list(const struct pernet_operations *ops, } } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id); +static void rtnl_net_notifyid(struct net *net, int cmd, int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0, id; @@ -162,7 +161,7 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); if (id >= 0) - rtnl_net_notifyid(net, peer, RTM_NEWNSID, id); + rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } @@ -365,7 +364,7 @@ static void cleanup_net(struct work_struct *work) int id = __peernet2id(tmp, net, false); if (id >= 0) { - rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); + rtnl_net_notifyid(tmp, RTM_DELNSID, id); idr_remove(&tmp->netns_ids, id); } } @@ -538,14 +537,10 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, struct net *peer, - int nsid) + int cmd, struct net *net, int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - int id; - - ASSERT_RTNL(); nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); if (!nlh) @@ -554,11 +549,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nsid >= 0) - id = nsid; - else - id = __peernet2id(net, peer, false); - if (nla_put_s32(skb, NETNSA_NSID, id)) + if (nla_put_s32(skb, NETNSA_NSID, nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -575,7 +566,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[NETNSA_MAX + 1]; struct sk_buff *msg; struct net *peer; - int err; + int err, id; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy); @@ -597,8 +588,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } + id = __peernet2id(net, peer, false); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer, -1); + RTM_GETNSID, net, id); if (err < 0) goto err_out; @@ -630,7 +622,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, peer, id); + RTM_NEWNSID, net_cb->net, id); if (ret < 0) return ret; @@ -658,8 +650,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id) { struct sk_buff *msg; int err = -ENOMEM; @@ -668,7 +659,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id); + err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); if (err < 0) goto err_out; -- cgit v1.2.3-59-g8ed1b From 7a0877d4b438886b72be61632eaa774d13262f70 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:49 +0200 Subject: netns: rename peernet2id() to peernet2id_alloc() In a following commit, a new function will be introduced to only lookup for a nsid (no allocation if the nsid doesn't exist). To avoid confusion, the existing function is renamed. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 +- include/net/net_namespace.h | 2 +- net/core/net_namespace.c | 4 ++-- net/core/rtnetlink.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3517ab0aa803..48341ae49012 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -336,7 +336,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (!net_eq(dev_net(vxlan->dev), vxlan->net) && nla_put_s32(skb, NDA_LINK_NETNSID, - peernet2id(dev_net(vxlan->dev), vxlan->net))) + peernet2id_alloc(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f733656404de..6d1e2eae32fb 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -271,7 +271,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #define __net_initconst __initconst #endif -int peernet2id(struct net *net, struct net *peer); +int peernet2id_alloc(struct net *net, struct net *peer); struct net *get_net_ns_by_id(struct net *net, int id); struct pernet_operations { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 37c68bb72db3..9c806ac569f9 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -202,13 +202,13 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ -int peernet2id(struct net *net, struct net *peer) +int peernet2id_alloc(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; return __peernet2id(net, peer, alloc); } -EXPORT_SYMBOL(peernet2id); +EXPORT_SYMBOL(peernet2id_alloc); struct net *get_net_ns_by_id(struct net *net, int id) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 666e0928ba40..83e08323fdcd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1204,7 +1204,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id(dev_net(dev), link_net); + int id = peernet2id_alloc(dev_net(dev), link_net); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) goto nla_put_failure; -- cgit v1.2.3-59-g8ed1b From 3138dbf881274cb20d9aa1b307861f689e820fbe Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:50 +0200 Subject: netns: notify new nsid outside __peernet2id() There is no functional change with this patch. It will ease the refactoring of the locking system that protects nsids and the support of the netlink socket option NETLINK_LISTEN_ALL_NSID. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9c806ac569f9..ee864241f8d6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -147,10 +147,9 @@ static void ops_free_list(const struct pernet_operations *ops, } } -static void rtnl_net_notifyid(struct net *net, int cmd, int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { - int min = 0, max = 0, id; + int min = 0, max = 0; ASSERT_RTNL(); @@ -159,11 +158,7 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) max = reqid + 1; } - id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); - if (id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id); - - return id; + return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -179,34 +174,50 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -static int __peernet2id(struct net *net, struct net *peer, bool alloc) +static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); + bool alloc_it = *alloc; ASSERT_RTNL(); + *alloc = false; + /* Magic value for id 0. */ if (id == NET_ID_ZERO) return 0; if (id > 0) return id; - if (alloc) { + if (alloc_it) { id = alloc_netid(net, peer, -1); + *alloc = true; return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; } return NETNSA_NSID_NOT_ASSIGNED; } +static int __peernet2id(struct net *net, struct net *peer) +{ + bool no = false; + + return __peernet2id_alloc(net, peer, &no); +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ int peernet2id_alloc(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; + int id; - return __peernet2id(net, peer, alloc); + id = __peernet2id_alloc(net, peer, &alloc); + if (alloc && id >= 0) + rtnl_net_notifyid(net, RTM_NEWNSID, id); + return id; } EXPORT_SYMBOL(peernet2id_alloc); @@ -361,7 +372,7 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); + int id = __peernet2id(tmp, net); if (id >= 0) { rtnl_net_notifyid(tmp, RTM_DELNSID, id); @@ -516,14 +527,16 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - if (__peernet2id(net, peer, false) >= 0) { + if (__peernet2id(net, peer) >= 0) { err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - if (err > 0) + if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; + } out: put_net(peer); return err; @@ -588,7 +601,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - id = __peernet2id(net, peer, false); + id = __peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, RTM_GETNSID, net, id); if (err < 0) -- cgit v1.2.3-59-g8ed1b From 95f38411df055a0ecefe3a3d119d98241087d5ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:51 +0200 Subject: netns: use a spin_lock to protect nsid management Before this patch, nsid were protected by the rtnl lock. The goal of this patch is to be able to find a nsid without needing to hold the rtnl lock. The next patch will introduce a netlink socket option to listen to all netns that have a nsid assigned into the netns where the socket is opened. Thus, it's important to call rtnl_net_notifyid() outside the spinlock, to avoid a recursive lock (nsid are notified via rtnl). This was the main reason of the previous patch. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 57 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ee864241f8d6..ae5008b097de 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -28,6 +28,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; DEFINE_MUTEX(net_mutex); +static DEFINE_SPINLOCK(nsid_lock); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -147,18 +148,17 @@ static void ops_free_list(const struct pernet_operations *ops, } } +/* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; - ASSERT_RTNL(); - if (reqid >= 0) { min = reqid; max = reqid + 1; } - return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -174,13 +174,15 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } +/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc + * is set to true, thus the caller knows that the new id must be notified via + * rtnl. + */ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); bool alloc_it = *alloc; - ASSERT_RTNL(); - *alloc = false; /* Magic value for id 0. */ @@ -198,6 +200,7 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) return NETNSA_NSID_NOT_ASSIGNED; } +/* should be called with nsid_lock held */ static int __peernet2id(struct net *net, struct net *peer) { bool no = false; @@ -211,27 +214,46 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id); */ int peernet2id_alloc(struct net *net, struct net *peer) { - bool alloc = atomic_read(&peer->count) == 0 ? false : true; + unsigned long flags; + bool alloc; int id; + spin_lock_irqsave(&nsid_lock, flags); + alloc = atomic_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); + spin_unlock_irqrestore(&nsid_lock, flags); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } EXPORT_SYMBOL(peernet2id_alloc); +/* This function returns, if assigned, the id of a peer netns. */ +static int peernet2id(struct net *net, struct net *peer) +{ + unsigned long flags; + int id; + + spin_lock_irqsave(&nsid_lock, flags); + id = __peernet2id(net, peer); + spin_unlock_irqrestore(&nsid_lock, flags); + return id; +} + struct net *get_net_ns_by_id(struct net *net, int id) { + unsigned long flags; struct net *peer; if (id < 0) return NULL; rcu_read_lock(); + spin_lock_irqsave(&nsid_lock, flags); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); + spin_unlock_irqrestore(&nsid_lock, flags); rcu_read_unlock(); return peer; @@ -372,14 +394,19 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net); + int id; - if (id >= 0) { - rtnl_net_notifyid(tmp, RTM_DELNSID, id); + spin_lock_irq(&nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) idr_remove(&tmp->netns_ids, id); - } + spin_unlock_irq(&nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); } + spin_lock_irq(&nsid_lock); idr_destroy(&net->netns_ids); + spin_unlock_irq(&nsid_lock); } rtnl_unlock(); @@ -507,6 +534,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + unsigned long flags; struct net *peer; int nsid, err; @@ -527,12 +555,14 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); + spin_lock_irqsave(&nsid_lock, flags); if (__peernet2id(net, peer) >= 0) { err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); + spin_unlock_irqrestore(&nsid_lock, flags); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; @@ -601,7 +631,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - id = __peernet2id(net, peer); + id = peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, RTM_GETNSID, net, id); if (err < 0) @@ -654,10 +684,11 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .idx = 0, .s_idx = cb->args[0], }; + unsigned long flags; - ASSERT_RTNL(); - + spin_lock_irqsave(&nsid_lock, flags); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); + spin_unlock_irqrestore(&nsid_lock, flags); cb->args[0] = net_cb.idx; return skb->len; -- cgit v1.2.3-59-g8ed1b From cc3a572fe6cf586f478546215bc5d3694357d71e Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:52 +0200 Subject: netlink: rename private flags and states These flags and states have the same prefix (NETLINK_) that netlink socket options. To avoid confusion and to be able to name a flag like a socket option, let's use an other prefix: NETLINK_[S|F]_. Note: a comment has been fixed, it was talking about NETLINK_RECV_NO_ENOBUFS socket option instead of NETLINK_NO_ENOBUFS. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 59 ++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ec4adbdcb9b4..bf7f56d7a9aa 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -76,17 +76,17 @@ struct listeners { }; /* state bits */ -#define NETLINK_CONGESTED 0x0 +#define NETLINK_S_CONGESTED 0x0 /* flags */ -#define NETLINK_KERNEL_SOCKET 0x1 -#define NETLINK_RECV_PKTINFO 0x2 -#define NETLINK_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_KERNEL_SOCKET 0x1 +#define NETLINK_F_RECV_PKTINFO 0x2 +#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_F_RECV_NO_ENOBUFS 0x8 static inline int netlink_is_kernel(struct sock *sk) { - return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; + return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; } struct netlink_table *nl_table; @@ -256,8 +256,9 @@ static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_S_CONGESTED, + &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; sk->sk_error_report(sk); } @@ -270,8 +271,8 @@ static void netlink_rcv_wake(struct sock *sk) struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(NETLINK_CONGESTED, &nlk->state); - if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + clear_bit(NETLINK_S_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } @@ -1656,7 +1657,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { @@ -1671,7 +1672,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -1895,7 +1896,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(NETLINK_CONGESTED, &nlk->state)) { + !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); @@ -1956,7 +1957,7 @@ static void do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); @@ -1966,7 +1967,7 @@ static void do_one_broadcast(struct sock *sk, p->skb2 = NULL; } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else { p->congested |= val; @@ -2057,7 +2058,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) !test_bit(p->group - 1, nlk->groups)) goto out; - if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) { + if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) { ret = 1; goto out; } @@ -2076,7 +2077,7 @@ out: * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the - * NETLINK_RECV_NO_ENOBUFS socket option. + * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { @@ -2136,9 +2137,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: if (val) - nlk->flags |= NETLINK_RECV_PKTINFO; + nlk->flags |= NETLINK_F_RECV_PKTINFO; else - nlk->flags &= ~NETLINK_RECV_PKTINFO; + nlk->flags &= ~NETLINK_F_RECV_PKTINFO; err = 0; break; case NETLINK_ADD_MEMBERSHIP: @@ -2167,18 +2168,18 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } case NETLINK_BROADCAST_ERROR: if (val) - nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; else - nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; err = 0; break; case NETLINK_NO_ENOBUFS: if (val) { - nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(NETLINK_CONGESTED, &nlk->state); + nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; + clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { - nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } err = 0; break; @@ -2227,7 +2228,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2237,7 +2238,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2247,7 +2248,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2418,7 +2419,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*addr); } - if (nlk->flags & NETLINK_RECV_PKTINFO) + if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); memset(&scm, 0, sizeof(scm)); @@ -2502,7 +2503,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, goto out_sock_release; nlk = nlk_sk(sk); - nlk->flags |= NETLINK_KERNEL_SOCKET; + nlk->flags |= NETLINK_F_KERNEL_SOCKET; netlink_table_grab(); if (!nl_table[unit].registered) { -- cgit v1.2.3-59-g8ed1b From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:53 +0200 Subject: netlink: allow to listen "all" netns More accurately, listen all netns that have a nsid assigned into the netns where the netlink socket is opened. For this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this socket will receive netlink notifications from all netns that have a nsid assigned into the netns where the socket has been opened. The nsid is sent to userland via an anscillary data. With this patch, a daemon needs only one socket to listen many netns. This is useful when the number of netns is high. Because 0 is a valid value for a nsid, the field nsid_is_set indicates if the field nsid is valid or not. skb->cb is initialized to 0 on skb allocation, thus we are sure that we will never send a nsid 0 by error to the userland. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 ++ include/net/net_namespace.h | 2 ++ include/uapi/linux/netlink.h | 1 + net/core/net_namespace.c | 10 ++++++++- net/netlink/af_netlink.c | 52 +++++++++++++++++++++++++++++++++++++++----- 5 files changed, 61 insertions(+), 6 deletions(-) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6835c1279df7..9120edb650a0 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -28,6 +28,8 @@ struct netlink_skb_parms { __u32 dst_group; __u32 flags; struct sock *sk; + bool nsid_is_set; + int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 6d1e2eae32fb..3f850acc844e 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -272,6 +272,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif int peernet2id_alloc(struct net *net, struct net *peer); +int peernet2id(struct net *net, struct net *peer); +bool peernet_has_id(struct net *net, struct net *peer); struct net *get_net_ns_by_id(struct net *net, int id); struct pernet_operations { diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 1a85940f8ab7..3e34b7d702f8 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -108,6 +108,7 @@ struct nlmsgerr { #define NETLINK_NO_ENOBUFS 5 #define NETLINK_RX_RING 6 #define NETLINK_TX_RING 7 +#define NETLINK_LISTEN_ALL_NSID 8 struct nl_pktinfo { __u32 group; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae5008b097de..a665bf490c88 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -229,7 +229,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) EXPORT_SYMBOL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ -static int peernet2id(struct net *net, struct net *peer) +int peernet2id(struct net *net, struct net *peer) { unsigned long flags; int id; @@ -240,6 +240,14 @@ static int peernet2id(struct net *net, struct net *peer) return id; } +/* This function returns true is the peer netns has an id assigned into the + * current netns. + */ +bool peernet_has_id(struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; +} + struct net *get_net_ns_by_id(struct net *net, int id) { unsigned long flags; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index bf7f56d7a9aa..a5fff75accf8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -83,6 +83,7 @@ struct listeners { #define NETLINK_F_RECV_PKTINFO 0x2 #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 static inline int netlink_is_kernel(struct sock *sk) { @@ -1932,8 +1933,17 @@ static void do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) return; - if (!net_eq(sock_net(sk), p->net)) - return; + if (!net_eq(sock_net(sk), p->net)) { + if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + return; + + if (!peernet_has_id(sock_net(sk), p->net)) + return; + + if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, + CAP_NET_BROADCAST)) + return; + } if (p->failure) { netlink_overrun(sk); @@ -1959,13 +1969,22 @@ static void do_one_broadcast(struct sock *sk, p->failure = 1; if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; - } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + goto out; + } + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if (sk_filter(sk, p->skb2)) { + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + goto out; + } + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + NETLINK_CB(p->skb2).nsid_is_set = true; + val = netlink_broadcast_deliver(sk, p->skb2); + if (val < 0) { netlink_overrun(sk); if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; @@ -1974,6 +1993,7 @@ static void do_one_broadcast(struct sock *sk, p->delivered = 1; p->skb2 = NULL; } +out: sock_put(sk); } @@ -2202,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; } #endif /* CONFIG_NETLINK_MMAP */ + case NETLINK_LISTEN_ALL_NSID: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) + return -EPERM; + + if (val) + nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; + else + nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2268,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } +static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + if (!NETLINK_CB(skb).nsid_is_set) + return; + + put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), + &NETLINK_CB(skb).nsid); +} + static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -2421,6 +2461,8 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); -- cgit v1.2.3-59-g8ed1b