Skip to content

Commit

Permalink
Merge branch 'netdev-adjacency'
Browse files Browse the repository at this point in the history
David Ahern says:

====================
net: Fix netdev adjacency tracking

The netdev adjacency tracking is failing to create proper dependencies
for some topologies. For example this topology

        +--------+
        |  myvrf |
        +--------+
          |    |
          |  +---------+
          |  | macvlan |
          |  +---------+
          |    |
      +----------+
      |  bridge  |
      +----------+
          |
      +--------+
      | bond1  |
      +--------+
          |
      +--------+
      |  eth3  |
      +--------+

hits 1 of 2 problems depending on the order of enslavement. The base set of
commands for both cases:

    ip link add bond1 type bond
    ip link set bond1 up
    ip link set eth3 down
    ip link set eth3 master bond1
    ip link set eth3 up

    ip link add bridge type bridge
    ip link set bridge up
    ip link add macvlan link bridge type macvlan
    ip link set macvlan up

    ip link add myvrf type vrf table 1234
    ip link set myvrf up

    ip link set bridge master myvrf

Case 1 enslave macvlan to the vrf before enslaving the bond to the bridge:

    ip link set macvlan master myvrf
    ip link set bond1 master bridge

Attempts to delete the VRF:
    ip link delete myvrf

trigger the BUG in __netdev_adjacent_dev_remove:

[  587.405260] tried to remove device eth3 from myvrf
[  587.407269] ------------[ cut here ]------------
[  587.408918] kernel BUG at /home/dsa/kernel.git/net/core/dev.c:5661!
[  587.411113] invalid opcode: 0000 [#1] SMP
[  587.412454] Modules linked in: macvlan bridge stp llc bonding vrf
[  587.414765] CPU: 0 PID: 726 Comm: ip Not tainted 4.8.0+ #109
[  587.416766] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
[  587.420241] task: ffff88013ab6eec0 task.stack: ffffc90000628000
[  587.422163] RIP: 0010:[<ffffffff813cef03>]  [<ffffffff813cef03>] __netdev_adjacent_dev_remove+0x40/0x12c
...
[  587.446053] Call Trace:
[  587.446424]  [<ffffffff813d1542>] __netdev_adjacent_dev_unlink+0x20/0x3c
[  587.447390]  [<ffffffff813d16a3>] netdev_upper_dev_unlink+0xfa/0x15e
[  587.448297]  [<ffffffffa00003a3>] vrf_del_slave+0x13/0x2a [vrf]
[  587.449153]  [<ffffffffa00004a4>] vrf_dev_uninit+0xea/0x114 [vrf]
[  587.450036]  [<ffffffff813d19b0>] rollback_registered_many+0x22b/0x2da
[  587.450974]  [<ffffffff813d1aac>] unregister_netdevice_many+0x17/0x48
[  587.451903]  [<ffffffff813de444>] rtnl_delete_link+0x3c/0x43
[  587.452719]  [<ffffffff813dedcd>] rtnl_dellink+0x180/0x194

When the BUG is converted to a WARN_ON it shows 4 missing adjacencies:
  eth3 - myvrf, mvrf - eth3, bond1 - myvrf and myvrf - bond1

All of those are because the __netdev_upper_dev_link function does not
properly link macvlan lower devices to myvrf when it is enslaved.

The second case just flips the ordering of the enslavements:
    ip link set bond1 master bridge
    ip link set macvlan master myvrf

Then run:
    ip link delete bond1
    ip link delete myvrf

The vrf delete command hangs because myvrf has a reference that has not
been released. In this case the removal code does not account for 2 paths
between eth3 and myvrf - one from bridge to vrf and the other through the
macvlan.

Rather than try to maintain a linked list of all upper and lower devices
per netdevice, only track the direct neighbors. The remaining stack can
be determined by recursively walking the neighbors.

The existing netdev_for_each_all_upper_dev_rcu,
netdev_for_each_all_lower_dev and netdev_for_each_all_lower_dev_rcu macros
are replaced with APIs that walk the upper and lower device lists. The
new APIs take a callback function and a data arg that is passed to the
callback for each device in the list. Drivers using the old macros are
converted in separate patches to make it easier on reviewers. It is an
API conversion only; no functional change is intended.

v3
- address Stephen's comment to simplify logic and remove typecasts

v2
- fixed bond0 references in cover-letter
- fixed definition of netdev_next_lower_dev_rcu to mirror the upper_dev
  version.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Oct 18, 2016
2 parents 5921a0f + 67b62f9 commit 5bb61cb
Show file tree
Hide file tree
Showing 10 changed files with 423 additions and 352 deletions.
9 changes: 1 addition & 8 deletions drivers/infiniband/core/core_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,7 @@ void ib_cache_release_one(struct ib_device *device);
static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
struct net_device *upper)
{
struct net_device *_upper = NULL;
struct list_head *iter;

netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
if (_upper == upper)
break;

return _upper == upper;
return netdev_has_upper_dev_all_rcu(dev, upper);
}

int addr_init(void);
Expand Down
42 changes: 23 additions & 19 deletions drivers/infiniband/core/roce_gid_mgmt.c
Original file line number Diff line number Diff line change
Expand Up @@ -437,37 +437,41 @@ static void callback_for_addr_gid_device_scan(struct ib_device *device,
&parsed->gid_attr);
}

struct upper_list {
struct list_head list;
struct net_device *upper;
};

static int netdev_upper_walk(struct net_device *upper, void *data)
{
struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
struct list_head *upper_list = data;

if (!entry) {
pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
return 0;
}

list_add_tail(&entry->list, upper_list);
dev_hold(upper);
entry->upper = upper;

return 0;
}

static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
void *cookie,
void (*handle_netdev)(struct ib_device *ib_dev,
u8 port,
struct net_device *ndev))
{
struct net_device *ndev = (struct net_device *)cookie;
struct upper_list {
struct list_head list;
struct net_device *upper;
};
struct net_device *upper;
struct list_head *iter;
struct upper_list *upper_iter;
struct upper_list *upper_temp;
LIST_HEAD(upper_list);

rcu_read_lock();
netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
struct upper_list *entry = kmalloc(sizeof(*entry),
GFP_ATOMIC);

if (!entry) {
pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
continue;
}

list_add_tail(&entry->list, &upper_list);
dev_hold(upper);
entry->upper = upper;
}
netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list);
rcu_read_unlock();

handle_netdev(ib_dev, port, ndev);
Expand Down
37 changes: 25 additions & 12 deletions drivers/infiniband/ulp/ipoib/ipoib_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,25 @@ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
return dev;
}

struct ipoib_walk_data {
const struct sockaddr *addr;
struct net_device *result;
};

static int ipoib_upper_walk(struct net_device *upper, void *_data)
{
struct ipoib_walk_data *data = _data;
int ret = 0;

if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
dev_hold(upper);
data->result = upper;
ret = 1;
}

return ret;
}

/**
* Find a net_device matching the given address, which is an upper device of
* the given net_device.
Expand All @@ -304,27 +323,21 @@ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
static struct net_device *ipoib_get_net_dev_match_addr(
const struct sockaddr *addr, struct net_device *dev)
{
struct net_device *upper,
*result = NULL;
struct list_head *iter;
struct ipoib_walk_data data = {
.addr = addr,
};

rcu_read_lock();
if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
dev_hold(dev);
result = dev;
data.result = dev;
goto out;
}

netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
dev_hold(upper);
result = upper;
break;
}
}
netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data);
out:
rcu_read_unlock();
return result;
return data.result;
}

/* returns the number of IPoIB netdevs on top a given ipoib device matching a
Expand Down
82 changes: 52 additions & 30 deletions drivers/net/bonding/bond_alb.c
Original file line number Diff line number Diff line change
Expand Up @@ -950,13 +950,61 @@ static void alb_send_lp_vid(struct slave *slave, u8 mac_addr[],
dev_queue_xmit(skb);
}

struct alb_walk_data {
struct bonding *bond;
struct slave *slave;
u8 *mac_addr;
bool strict_match;
};

static int alb_upper_dev_walk(struct net_device *upper, void *_data)
{
struct alb_walk_data *data = _data;
bool strict_match = data->strict_match;
struct bonding *bond = data->bond;
struct slave *slave = data->slave;
u8 *mac_addr = data->mac_addr;
struct bond_vlan_tag *tags;

if (is_vlan_dev(upper) && vlan_get_encap_level(upper) == 0) {
if (strict_match &&
ether_addr_equal_64bits(mac_addr,
upper->dev_addr)) {
alb_send_lp_vid(slave, mac_addr,
vlan_dev_vlan_proto(upper),
vlan_dev_vlan_id(upper));
} else if (!strict_match) {
alb_send_lp_vid(slave, upper->dev_addr,
vlan_dev_vlan_proto(upper),
vlan_dev_vlan_id(upper));
}
}

/* If this is a macvlan device, then only send updates
* when strict_match is turned off.
*/
if (netif_is_macvlan(upper) && !strict_match) {
tags = bond_verify_device_path(bond->dev, upper, 0);
if (IS_ERR_OR_NULL(tags))
BUG();
alb_send_lp_vid(slave, upper->dev_addr,
tags[0].vlan_proto, tags[0].vlan_id);
kfree(tags);
}

return 0;
}

static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[],
bool strict_match)
{
struct bonding *bond = bond_get_bond_by_slave(slave);
struct net_device *upper;
struct list_head *iter;
struct bond_vlan_tag *tags;
struct alb_walk_data data = {
.strict_match = strict_match,
.mac_addr = mac_addr,
.slave = slave,
.bond = bond,
};

/* send untagged */
alb_send_lp_vid(slave, mac_addr, 0, 0);
Expand All @@ -965,33 +1013,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[],
* for that device.
*/
rcu_read_lock();
netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) {
if (is_vlan_dev(upper) && vlan_get_encap_level(upper) == 0) {
if (strict_match &&
ether_addr_equal_64bits(mac_addr,
upper->dev_addr)) {
alb_send_lp_vid(slave, mac_addr,
vlan_dev_vlan_proto(upper),
vlan_dev_vlan_id(upper));
} else if (!strict_match) {
alb_send_lp_vid(slave, upper->dev_addr,
vlan_dev_vlan_proto(upper),
vlan_dev_vlan_id(upper));
}
}

/* If this is a macvlan device, then only send updates
* when strict_match is turned off.
*/
if (netif_is_macvlan(upper) && !strict_match) {
tags = bond_verify_device_path(bond->dev, upper, 0);
if (IS_ERR_OR_NULL(tags))
BUG();
alb_send_lp_vid(slave, upper->dev_addr,
tags[0].vlan_proto, tags[0].vlan_id);
kfree(tags);
}
}
netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &data);
rcu_read_unlock();
}

Expand Down
17 changes: 9 additions & 8 deletions drivers/net/bonding/bond_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2270,22 +2270,23 @@ static void bond_mii_monitor(struct work_struct *work)
}
}

static int bond_upper_dev_walk(struct net_device *upper, void *data)
{
__be32 ip = *((__be32 *)data);

return ip == bond_confirm_addr(upper, 0, ip);
}

static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
{
struct net_device *upper;
struct list_head *iter;
bool ret = false;

if (ip == bond_confirm_addr(bond->dev, 0, ip))
return true;

rcu_read_lock();
netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) {
if (ip == bond_confirm_addr(upper, 0, ip)) {
ret = true;
break;
}
}
if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &ip))
ret = true;
rcu_read_unlock();

return ret;
Expand Down
Loading

0 comments on commit 5bb61cb

Please sign in to comment.