Skip to content

Commit

Permalink
Merge branch 'netns-scalability'
Browse files Browse the repository at this point in the history
Nicolas Dichtel says:

====================
netns: ease netlink use with a lot of netns

This idea was informally discussed in Ottawa / netdev0.1. The goal is to
ease the use/scalability of netns, from a userland point of view.
Today, users need to open one netlink socket per family and per netns.
Thus, when the number of netns inscreases (for example 5K or more), the
number of sockets needed to manage them grows a lot.

The goal of this series is to be able to monitor netlink events, for a
specified family, for a set of netns, with only one netlink socket. For
this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID.
When this option is set on a netlink socket, this socket will receive
netlink notifications from all netns that have a nsid assigned into the
netns where the socket has been opened.
The nsid is sent to userland via an anscillary data.

Here is an example with a patched iproute2. vxlan10 is created in the
current netns (netns0, nsid 0) and then moved to another netns (netns1,
nsid 1):

$ ip netns exec netns0 ip monitor all-nsid label
[nsid 0][NSID]nsid 1 (iproute2 netns name: netns1)
[nsid 0][NEIGH]??? lladdr 00:00:00:00:00:00 REACHABLE,PERMANENT
[nsid 0][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff
[nsid 0][LINK]Deleted 5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff
[nsid 1][NSID]nsid 0 (iproute2 netns name: netns0)
[nsid 1][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff link-netnsid 0
[nsid 1][ADDR]5: vxlan10    inet 192.168.0.249/24 brd 192.168.0.255 scope global vxlan10
       valid_lft forever preferred_lft forever
[nsid 1][ROUTE]local 192.168.0.249 dev vxlan10  table local  proto kernel  scope host  src 192.168.0.249
[nsid 1][ROUTE]ff00::/8 dev vxlan10  table local  metric 256  pref medium
[nsid 1][ROUTE]2001:123::/64 dev vxlan10  proto kernel  metric 256  pref medium
[nsid 1][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UNKNOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff link-netnsid 0
[nsid 1][ROUTE]broadcast 192.168.0.255 dev vxlan10  table local  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]192.168.0.0/24 dev vxlan10  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]broadcast 192.168.0.0 dev vxlan10  table local  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]fe80::/64 dev vxlan10  proto kernel  metric 256  pref medium
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed May 10, 2015
2 parents 43996fd + 59324cf commit 4d95b72
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 82 deletions.
2 changes: 1 addition & 1 deletion drivers/net/vxlan.c
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,

if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
nla_put_s32(skb, NDA_LINK_NETNSID,
peernet2id(dev_net(vxlan->dev), vxlan->net)))
peernet2id_alloc(dev_net(vxlan->dev), vxlan->net)))
goto nla_put_failure;

if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
Expand Down
2 changes: 2 additions & 0 deletions include/linux/netlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ struct netlink_skb_parms {
__u32 dst_group;
__u32 flags;
struct sock *sk;
bool nsid_is_set;
int nsid;
};

#define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb))
Expand Down
2 changes: 2 additions & 0 deletions include/net/net_namespace.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,9 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
#define __net_initconst __initconst
#endif

int peernet2id_alloc(struct net *net, struct net *peer);
int peernet2id(struct net *net, struct net *peer);
bool peernet_has_id(struct net *net, struct net *peer);
struct net *get_net_ns_by_id(struct net *net, int id);

struct pernet_operations {
Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/netlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ struct nlmsgerr {
#define NETLINK_NO_ENOBUFS 5
#define NETLINK_RX_RING 6
#define NETLINK_TX_RING 7
#define NETLINK_LISTEN_ALL_NSID 8

struct nl_pktinfo {
__u32 group;
Expand Down
132 changes: 86 additions & 46 deletions net/core/net_namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
DEFINE_MUTEX(net_mutex);
static DEFINE_SPINLOCK(nsid_lock);

LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
Expand Down Expand Up @@ -147,24 +148,17 @@ static void ops_free_list(const struct pernet_operations *ops,
}
}

static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
int id);
/* should be called with nsid_lock held */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
int min = 0, max = 0, id;

ASSERT_RTNL();
int min = 0, max = 0;

if (reqid >= 0) {
min = reqid;
max = reqid + 1;
}

id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
if (id >= 0)
rtnl_net_notifyid(net, peer, RTM_NEWNSID, id);

return id;
return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
}

/* This function is used by idr_for_each(). If net is equal to peer, the
Expand All @@ -180,48 +174,94 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0;
}

static int __peernet2id(struct net *net, struct net *peer, bool alloc)
/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
* is set to true, thus the caller knows that the new id must be notified via
* rtnl.
*/
static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
{
int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
bool alloc_it = *alloc;

ASSERT_RTNL();
*alloc = false;

/* Magic value for id 0. */
if (id == NET_ID_ZERO)
return 0;
if (id > 0)
return id;

if (alloc)
return alloc_netid(net, peer, -1);
if (alloc_it) {
id = alloc_netid(net, peer, -1);
*alloc = true;
return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
}

return NETNSA_NSID_NOT_ASSIGNED;
}

/* should be called with nsid_lock held */
static int __peernet2id(struct net *net, struct net *peer)
{
bool no = false;

return -ENOENT;
return __peernet2id_alloc(net, peer, &no);
}

static void rtnl_net_notifyid(struct net *net, int cmd, int id);
/* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned.
*/
int peernet2id_alloc(struct net *net, struct net *peer)
{
unsigned long flags;
bool alloc;
int id;

spin_lock_irqsave(&nsid_lock, flags);
alloc = atomic_read(&peer->count) == 0 ? false : true;
id = __peernet2id_alloc(net, peer, &alloc);
spin_unlock_irqrestore(&nsid_lock, flags);
if (alloc && id >= 0)
rtnl_net_notifyid(net, RTM_NEWNSID, id);
return id;
}
EXPORT_SYMBOL(peernet2id_alloc);

/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
{
bool alloc = atomic_read(&peer->count) == 0 ? false : true;
unsigned long flags;
int id;

id = __peernet2id(net, peer, alloc);
return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
spin_lock_irqsave(&nsid_lock, flags);
id = __peernet2id(net, peer);
spin_unlock_irqrestore(&nsid_lock, flags);
return id;
}

/* This function returns true is the peer netns has an id assigned into the
* current netns.
*/
bool peernet_has_id(struct net *net, struct net *peer)
{
return peernet2id(net, peer) >= 0;
}
EXPORT_SYMBOL(peernet2id);

struct net *get_net_ns_by_id(struct net *net, int id)
{
unsigned long flags;
struct net *peer;

if (id < 0)
return NULL;

rcu_read_lock();
spin_lock_irqsave(&nsid_lock, flags);
peer = idr_find(&net->netns_ids, id);
if (peer)
get_net(peer);
spin_unlock_irqrestore(&nsid_lock, flags);
rcu_read_unlock();

return peer;
Expand Down Expand Up @@ -362,14 +402,19 @@ static void cleanup_net(struct work_struct *work)
list_del_rcu(&net->list);
list_add_tail(&net->exit_list, &net_exit_list);
for_each_net(tmp) {
int id = __peernet2id(tmp, net, false);
int id;

if (id >= 0) {
rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
spin_lock_irq(&nsid_lock);
id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
}
spin_unlock_irq(&nsid_lock);
if (id >= 0)
rtnl_net_notifyid(tmp, RTM_DELNSID, id);
}
spin_lock_irq(&nsid_lock);
idr_destroy(&net->netns_ids);
spin_unlock_irq(&nsid_lock);

}
rtnl_unlock();
Expand Down Expand Up @@ -497,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
unsigned long flags;
struct net *peer;
int nsid, err;

Expand All @@ -517,14 +563,18 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
if (IS_ERR(peer))
return PTR_ERR(peer);

if (__peernet2id(net, peer, false) >= 0) {
spin_lock_irqsave(&nsid_lock, flags);
if (__peernet2id(net, peer) >= 0) {
err = -EEXIST;
goto out;
}

err = alloc_netid(net, peer, nsid);
if (err > 0)
spin_unlock_irqrestore(&nsid_lock, flags);
if (err >= 0) {
rtnl_net_notifyid(net, RTM_NEWNSID, err);
err = 0;
}
out:
put_net(peer);
return err;
Expand All @@ -538,14 +588,10 @@ static int rtnl_net_get_size(void)
}

static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
int cmd, struct net *net, struct net *peer,
int nsid)
int cmd, struct net *net, int nsid)
{
struct nlmsghdr *nlh;
struct rtgenmsg *rth;
int id;

ASSERT_RTNL();

nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
if (!nlh)
Expand All @@ -554,14 +600,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
rth = nlmsg_data(nlh);
rth->rtgen_family = AF_UNSPEC;

if (nsid >= 0) {
id = nsid;
} else {
id = __peernet2id(net, peer, false);
if (id < 0)
id = NETNSA_NSID_NOT_ASSIGNED;
}
if (nla_put_s32(skb, NETNSA_NSID, id))
if (nla_put_s32(skb, NETNSA_NSID, nsid))
goto nla_put_failure;

nlmsg_end(skb, nlh);
Expand All @@ -578,7 +617,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *tb[NETNSA_MAX + 1];
struct sk_buff *msg;
struct net *peer;
int err;
int err, id;

err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
rtnl_net_policy);
Expand All @@ -600,8 +639,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
goto out;
}

id = peernet2id(net, peer);
err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
RTM_GETNSID, net, peer, -1);
RTM_GETNSID, net, id);
if (err < 0)
goto err_out;

Expand Down Expand Up @@ -633,7 +673,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data)

ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWNSID, net_cb->net, peer, id);
RTM_NEWNSID, net_cb->net, id);
if (ret < 0)
return ret;

Expand All @@ -652,17 +692,17 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
.idx = 0,
.s_idx = cb->args[0],
};
unsigned long flags;

ASSERT_RTNL();

spin_lock_irqsave(&nsid_lock, flags);
idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
spin_unlock_irqrestore(&nsid_lock, flags);

cb->args[0] = net_cb.idx;
return skb->len;
}

static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
int id)
static void rtnl_net_notifyid(struct net *net, int cmd, int id)
{
struct sk_buff *msg;
int err = -ENOMEM;
Expand All @@ -671,7 +711,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
if (!msg)
goto out;

err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id);
err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
if (err < 0)
goto err_out;

Expand Down
2 changes: 1 addition & 1 deletion net/core/rtnetlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -1204,7 +1204,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);

if (!net_eq(dev_net(dev), link_net)) {
int id = peernet2id(dev_net(dev), link_net);
int id = peernet2id_alloc(dev_net(dev), link_net);

if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
goto nla_put_failure;
Expand Down
Loading

0 comments on commit 4d95b72

Please sign in to comment.