Skip to content

Commit

Permalink
net/mlx5: Lag, properly lock eswitch if needed
Browse files Browse the repository at this point in the history
Currently when doing hardware lag we check the eswitch mode
but as this isn't done under a lock the check isn't valid.

As the code needs to synchronize between two different devices,
extra care is needed.

- When going to change eswitch mode, if hardware lag is active, destroy it.
- While changing eswitch modes block any hardware bond creation.
- Delay handling bonding events until there are no mode changes in
  progress.
- When attaching a new mdev to lag, block until there is no mode change
  in progress. In order for the mode change to finish the interface lock
  will have to be taken. Release the lock and sleep for 100ms to
  allow forward progress. As this is a very rare condition (can happen if
  the user unbinds and binds a PCI function while also changing eswitch
  mode of the other PCI function) it has no real world impact.

As taking multiple eswitch mode locks is now required lockdep will
complain about a possible deadlock. Register a key per eswitch to make
lockdep happy.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
  • Loading branch information
Mark Bloch authored and Saeed Mahameed committed Aug 5, 2021
1 parent 898b078 commit cac1eb2
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 18 deletions.
24 changes: 20 additions & 4 deletions drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
Original file line number Diff line number Diff line change
Expand Up @@ -1458,8 +1458,6 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs)

esw->mode = mode;

mlx5_lag_update(esw->dev);

if (mode == MLX5_ESWITCH_LEGACY) {
err = esw_legacy_enable(esw);
} else {
Expand Down Expand Up @@ -1506,6 +1504,7 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
if (!mlx5_esw_allowed(esw))
return 0;

mlx5_lag_disable_change(esw->dev);
down_write(&esw->mode_lock);
if (esw->mode == MLX5_ESWITCH_NONE) {
ret = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY, num_vfs);
Expand All @@ -1519,6 +1518,7 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
esw->esw_funcs.num_vfs = num_vfs;
}
up_write(&esw->mode_lock);
mlx5_lag_enable_change(esw->dev);
return ret;
}

Expand Down Expand Up @@ -1550,8 +1550,6 @@ void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw, bool clear_vf)
old_mode = esw->mode;
esw->mode = MLX5_ESWITCH_NONE;

mlx5_lag_update(esw->dev);

if (old_mode == MLX5_ESWITCH_OFFLOADS)
mlx5_rescan_drivers(esw->dev);

Expand All @@ -1567,10 +1565,12 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf)
if (!mlx5_esw_allowed(esw))
return;

mlx5_lag_disable_change(esw->dev);
down_write(&esw->mode_lock);
mlx5_eswitch_disable_locked(esw, clear_vf);
esw->esw_funcs.num_vfs = 0;
up_write(&esw->mode_lock);
mlx5_lag_enable_change(esw->dev);
}

static int mlx5_query_hca_cap_host_pf(struct mlx5_core_dev *dev, void *out)
Expand Down Expand Up @@ -1759,7 +1759,9 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
ida_init(&esw->offloads.vport_metadata_ida);
xa_init_flags(&esw->offloads.vhca_map, XA_FLAGS_ALLOC);
mutex_init(&esw->state_lock);
lockdep_register_key(&esw->mode_lock_key);
init_rwsem(&esw->mode_lock);
lockdep_set_class(&esw->mode_lock, &esw->mode_lock_key);

esw->enabled_vports = 0;
esw->mode = MLX5_ESWITCH_NONE;
Expand Down Expand Up @@ -1793,6 +1795,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)

esw->dev->priv.eswitch = NULL;
destroy_workqueue(esw->work_queue);
lockdep_unregister_key(&esw->mode_lock_key);
mutex_destroy(&esw->state_lock);
WARN_ON(!xa_empty(&esw->offloads.vhca_map));
xa_destroy(&esw->offloads.vhca_map);
Expand Down Expand Up @@ -2366,9 +2369,22 @@ int mlx5_esw_try_lock(struct mlx5_eswitch *esw)
*/
void mlx5_esw_unlock(struct mlx5_eswitch *esw)
{
	/* Only esw-manager capable functions ever take the mode lock,
	 * so there is nothing to release for anyone else.
	 */
	if (mlx5_esw_allowed(esw))
		up_write(&esw->mode_lock);
}

/**
 * mlx5_esw_lock() - Acquire the eswitch mode lock for writing
 * @esw: eswitch device.
 *
 * No-op for functions that are not allowed to manage the eswitch
 * (mirrors the guard in mlx5_esw_unlock()).
 */
void mlx5_esw_lock(struct mlx5_eswitch *esw)
{
	if (mlx5_esw_allowed(esw))
		down_write(&esw->mode_lock);
}

/**
* mlx5_eswitch_get_total_vports - Get total vports of the eswitch
*
Expand Down
5 changes: 5 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ struct mlx5_eswitch {
u32 large_group_num;
} params;
struct blocking_notifier_head n_head;
struct lock_class_key mode_lock_key;
};

void esw_offloads_disable(struct mlx5_eswitch *esw);
Expand Down Expand Up @@ -707,6 +708,7 @@ void mlx5_esw_get(struct mlx5_core_dev *dev);
void mlx5_esw_put(struct mlx5_core_dev *dev);
int mlx5_esw_try_lock(struct mlx5_eswitch *esw);
void mlx5_esw_unlock(struct mlx5_eswitch *esw);
void mlx5_esw_lock(struct mlx5_eswitch *esw);

void esw_vport_change_handle_locked(struct mlx5_vport *vport);

Expand All @@ -727,6 +729,9 @@ static inline const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev)
return ERR_PTR(-EOPNOTSUPP);
}

static inline void mlx5_esw_unlock(struct mlx5_eswitch *esw) { return; }
static inline void mlx5_esw_lock(struct mlx5_eswitch *esw) { return; }

static inline struct mlx5_flow_handle *
esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag)
{
Expand Down
5 changes: 4 additions & 1 deletion drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
Original file line number Diff line number Diff line change
Expand Up @@ -3051,10 +3051,11 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
if (esw_mode_from_devlink(mode, &mlx5_mode))
return -EINVAL;

mlx5_lag_disable_change(esw->dev);
err = mlx5_esw_try_lock(esw);
if (err < 0) {
NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy");
return err;
goto enable_lag;
}
cur_mlx5_mode = err;
err = 0;
Expand All @@ -3071,6 +3072,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,

unlock:
mlx5_esw_unlock(esw);
enable_lag:
mlx5_lag_enable_change(esw->dev);
return err;
}

Expand Down
83 changes: 72 additions & 11 deletions drivers/net/ethernet/mellanox/mlx5/core/lag.c
Original file line number Diff line number Diff line change
Expand Up @@ -418,21 +418,48 @@ static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
}

/* Take the eswitch mode lock of both devices that can form a lag.
 * Lock order is fixed — dev0 (port 1) before dev1 (port 2) — and must
 * stay the mirror image of mlx5_lag_unlock_eswitches(). Either pointer
 * may be NULL (its lock is simply skipped).
 */
static void mlx5_lag_lock_eswitches(struct mlx5_core_dev *dev0,
				    struct mlx5_core_dev *dev1)
{
	if (dev0)
		mlx5_esw_lock(dev0->priv.eswitch);
	if (dev1)
		mlx5_esw_lock(dev1->priv.eswitch);
}

/* Release both eswitch mode locks in the reverse order of
 * mlx5_lag_lock_eswitches(): dev1 first, then dev0. Either pointer
 * may be NULL (its unlock is simply skipped).
 */
static void mlx5_lag_unlock_eswitches(struct mlx5_core_dev *dev0,
				      struct mlx5_core_dev *dev1)
{
	if (dev1)
		mlx5_esw_unlock(dev1->priv.eswitch);
	if (dev0)
		mlx5_esw_unlock(dev0->priv.eswitch);
}

/* Delayed-work handler that (re)evaluates the hardware bond state.
 *
 * The ordering here is deliberate: the interface list lock must be
 * taken (non-blocking, since we run from a workqueue) before the
 * per-eswitch mode locks. If the list lock is contended, or an
 * eswitch mode change is in flight (mode_changes_in_progress != 0),
 * the work requeues itself with a 1 second delay instead of blocking.
 */
static void mlx5_do_bond_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
					     bond_work);
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	int status;

	status = mlx5_dev_list_trylock();
	if (!status) {
		/* 1 sec delay. */
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	/* An eswitch mode change is in progress; retry later rather
	 * than racing it (mode_changes_in_progress is protected by the
	 * dev list lock we now hold).
	 */
	if (ldev->mode_changes_in_progress) {
		mlx5_dev_list_unlock();
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	/* Hold both eswitch mode locks across the actual bond update. */
	mlx5_lag_lock_eswitches(dev0, dev1);
	mlx5_do_bond(ldev);
	mlx5_lag_unlock_eswitches(dev0, dev1);
	mlx5_dev_list_unlock();
}

Expand Down Expand Up @@ -630,15 +657,15 @@ static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
}

/* Must be called with intf_mutex held */
static void __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
{
struct mlx5_lag *ldev = NULL;
struct mlx5_core_dev *tmp_dev;

if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
!MLX5_CAP_GEN(dev, lag_master) ||
MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS)
return;
return 0;

tmp_dev = mlx5_get_next_phys_dev(dev);
if (tmp_dev)
Expand All @@ -648,15 +675,17 @@ static void __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
ldev = mlx5_lag_dev_alloc(dev);
if (!ldev) {
mlx5_core_err(dev, "Failed to alloc lag dev\n");
return;
return 0;
}
} else {
if (ldev->mode_changes_in_progress)
return -EAGAIN;
mlx5_ldev_get(ldev);
}

mlx5_ldev_add_mdev(ldev, dev);

return;
return 0;
}

void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
Expand All @@ -667,16 +696,30 @@ void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
if (!ldev)
return;

recheck:
mlx5_dev_list_lock();
if (ldev->mode_changes_in_progress) {
mlx5_dev_list_unlock();
msleep(100);
goto recheck;
}
mlx5_ldev_remove_mdev(ldev, dev);
mlx5_dev_list_unlock();
mlx5_ldev_put(ldev);
}

void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
{
int err;

recheck:
mlx5_dev_list_lock();
__mlx5_lag_dev_add_mdev(dev);
err = __mlx5_lag_dev_add_mdev(dev);
if (err) {
mlx5_dev_list_unlock();
msleep(100);
goto recheck;
}
mlx5_dev_list_unlock();
}

Expand Down Expand Up @@ -716,6 +759,7 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,

if (i >= MLX5_MAX_PORTS)
ldev->flags |= MLX5_LAG_FLAG_READY;
mlx5_queue_bond_work(ldev, 0);
}

bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
Expand Down Expand Up @@ -789,19 +833,36 @@ bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
}
EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);

void mlx5_lag_update(struct mlx5_core_dev *dev)
void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
{
struct mlx5_core_dev *dev0;
struct mlx5_core_dev *dev1;
struct mlx5_lag *ldev;

mlx5_dev_list_lock();

ldev = mlx5_lag_dev(dev);
if (!ldev)
goto unlock;
dev0 = ldev->pf[MLX5_LAG_P1].dev;
dev1 = ldev->pf[MLX5_LAG_P2].dev;

mlx5_do_bond(ldev);
ldev->mode_changes_in_progress++;
if (__mlx5_lag_is_active(ldev)) {
mlx5_lag_lock_eswitches(dev0, dev1);
mlx5_disable_lag(ldev);
mlx5_lag_unlock_eswitches(dev0, dev1);
}
mlx5_dev_list_unlock();
}

unlock:
/* Re-allow hardware bond creation after a mode change completed, and
 * kick the bond work to process any events delayed while changes were
 * blocked. Pairs with mlx5_lag_disable_change().
 *
 * Fix: mlx5_lag_disable_change() bails out when the device has no lag
 * device (!ldev), so it never incremented mode_changes_in_progress for
 * such a device; the original code here dereferenced ldev
 * unconditionally and would NULL-deref in that case. Mirror the NULL
 * check.
 */
void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	mlx5_dev_list_lock();
	ldev = mlx5_lag_dev(dev);
	if (!ldev) {
		mlx5_dev_list_unlock();
		return;
	}
	ldev->mode_changes_in_progress--;
	mlx5_dev_list_unlock();
	mlx5_queue_bond_work(ldev, 0);
}

struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/lag.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ struct lag_tracker {
*/
struct mlx5_lag {
u8 flags;
int mode_changes_in_progress;
bool shared_fdb;
u8 v2p_map[MLX5_MAX_PORTS];
struct kref ref;
Expand Down
5 changes: 3 additions & 2 deletions drivers/net/ethernet/mellanox/mlx5/core/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1179,18 +1179,19 @@ static int mlx5_load(struct mlx5_core_dev *dev)
goto err_ec;
}

mlx5_lag_add_mdev(dev);
err = mlx5_sriov_attach(dev);
if (err) {
mlx5_core_err(dev, "sriov init failed %d\n", err);
goto err_sriov;
}

mlx5_sf_dev_table_create(dev);
mlx5_lag_add_mdev(dev);

return 0;

err_sriov:
mlx5_lag_remove_mdev(dev);
mlx5_ec_cleanup(dev);
err_ec:
mlx5_sf_hw_table_destroy(dev);
Expand Down Expand Up @@ -1222,9 +1223,9 @@ static int mlx5_load(struct mlx5_core_dev *dev)

static void mlx5_unload(struct mlx5_core_dev *dev)
{
mlx5_lag_remove_mdev(dev);
mlx5_sf_dev_table_destroy(dev);
mlx5_sriov_detach(dev);
mlx5_lag_remove_mdev(dev);
mlx5_ec_cleanup(dev);
mlx5_sf_hw_table_destroy(dev);
mlx5_vhca_event_stop(dev);
Expand Down
2 changes: 2 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, struct net_device *netdev);
void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, struct net_device *netdev);
void mlx5_lag_add_mdev(struct mlx5_core_dev *dev);
void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev);
void mlx5_lag_disable_change(struct mlx5_core_dev *dev);
void mlx5_lag_enable_change(struct mlx5_core_dev *dev);

int mlx5_events_init(struct mlx5_core_dev *dev);
void mlx5_events_cleanup(struct mlx5_core_dev *dev);
Expand Down

0 comments on commit cac1eb2

Please sign in to comment.