Skip to content

Commit

Permalink
net/mlx5: Fix race between PCI error handlers and health work
Browse files Browse the repository at this point in the history
Currently there is a race between the health care work and the kernel
pci error handlers because both of them detect the error, the first one
to be called will do the error handling.
There is a chance that health care will disable the pci after resuming
pci slot.
Also create a separate WQ because now we will have two types of health
works, one for the error detection and one for the recovery.

Fixes: 89d44f0 ('net/mlx5_core: Add pci error handlers to mlx5_core driver')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Mohamad Haj Yahia authored and David S. Miller committed Oct 29, 2016
1 parent 2241007 commit 05ac2c0
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
30 changes: 27 additions & 3 deletions drivers/net/ethernet/mellanox/mlx5/core/health.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ enum {
MLX5_NIC_IFC_NO_DRAM_NIC = 2
};

enum {
MLX5_DROP_NEW_HEALTH_WORK,
};

static u8 get_nic_interface(struct mlx5_core_dev *dev)
{
return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
Expand Down Expand Up @@ -272,7 +276,13 @@ static void poll_health(unsigned long data)
if (in_fatal(dev) && !health->sick) {
health->sick = true;
print_health_info(dev);
schedule_work(&health->work);
spin_lock(&health->wq_lock);
if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
queue_work(health->wq, &health->work);
else
dev_err(&dev->pdev->dev,
"new health works are not permitted at this stage\n");
spin_unlock(&health->wq_lock);
}
}

Expand All @@ -282,6 +292,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)

init_timer(&health->timer);
health->sick = 0;
clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
health->health = &dev->iseg->health;
health->health_counter = &dev->iseg->health_counter;

Expand All @@ -298,11 +309,21 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
del_timer_sync(&health->timer);
}

void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;

spin_lock(&health->wq_lock);
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
spin_unlock(&health->wq_lock);
cancel_work_sync(&health->work);
}

void mlx5_health_cleanup(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;

flush_work(&health->work);
destroy_workqueue(health->wq);
}

int mlx5_health_init(struct mlx5_core_dev *dev)
Expand All @@ -317,8 +338,11 @@ int mlx5_health_init(struct mlx5_core_dev *dev)

strcpy(name, "mlx5_health");
strcat(name, dev_name(&dev->pdev->dev));
health->wq = create_singlethread_workqueue(name);
kfree(name);

if (!health->wq)
return -ENOMEM;
spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_care);

return 0;
Expand Down
9 changes: 7 additions & 2 deletions drivers/net/ethernet/mellanox/mlx5/core/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
dev_info(&pdev->dev, "%s was called\n", __func__);
mlx5_enter_error_state(dev);
mlx5_unload_one(dev, priv, false);
pci_save_state(pdev);
mlx5_pci_disable_device(dev);
/* In case of kernel call save the pci state and drain health wq */
if (state) {
pci_save_state(pdev);
mlx5_drain_health_wq(dev);
mlx5_pci_disable_device(dev);
}

return state == pci_channel_io_perm_failure ?
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
}
Expand Down
4 changes: 4 additions & 0 deletions include/linux/mlx5/driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,10 @@ struct mlx5_core_health {
u32 prev;
int miss_counter;
bool sick;
/* wq spinlock to synchronize draining */
spinlock_t wq_lock;
struct workqueue_struct *wq;
unsigned long flags;
struct work_struct work;
};

Expand Down Expand Up @@ -778,6 +781,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
int mlx5_health_init(struct mlx5_core_dev *dev);
void mlx5_start_health_poll(struct mlx5_core_dev *dev);
void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
struct mlx5_buf *buf, int node);
int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
Expand Down

0 comments on commit 05ac2c0

Please sign in to comment.