Skip to content

Commit

Permalink
net/mlx5: Read timeout values from DTOR
Browse files Browse the repository at this point in the history
Replace hard coded timeouts with values stored by firmware in default
timeouts register (DTOR). Timeouts are read during driver load. If DTOR
is not supported by firmware then fallback to hard coded defaults
instead.

Signed-off-by: Amir Tzin <amirtz@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
  • Loading branch information
Amir Tzin authored and Saeed Mahameed committed Oct 16, 2021
1 parent 5945e1a commit 32def41
Show file tree
Hide file tree
Showing 10 changed files with 124 additions and 40 deletions.
1 change: 0 additions & 1 deletion drivers/net/ethernet/mellanox/mlx5/core/en/health.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq);
void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq);

#define MLX5E_REPORTER_PER_Q_MAX_LEN 256
#define MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC 2000

struct mlx5e_err_ctx {
int (*recover)(void *ctx);
Expand Down
7 changes: 5 additions & 2 deletions drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "txrx.h"
#include "devlink.h"
#include "ptp.h"
#include "lib/tout.h"

static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state)
{
Expand All @@ -32,8 +33,10 @@ static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state)

static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq)
{
unsigned long exp_time = jiffies +
msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
struct mlx5_core_dev *dev = icosq->channel->mdev;
unsigned long exp_time;

exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR));

while (time_before(jiffies, exp_time)) {
if (icosq->cc == icosq->pc)
Expand Down
7 changes: 5 additions & 2 deletions drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
#include "health.h"
#include "en/ptp.h"
#include "en/devlink.h"
#include "lib/tout.h"

static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
{
unsigned long exp_time = jiffies +
msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
struct mlx5_core_dev *dev = sq->mdev;
unsigned long exp_time;

exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR));

while (time_before(jiffies, exp_time)) {
if (sq->cc == sq->pc)
Expand Down
9 changes: 5 additions & 4 deletions drivers/net/ethernet/mellanox/mlx5/core/fw.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include <linux/module.h>
#include "mlx5_core.h"
#include "../../mlxfw/mlxfw.h"
#include "lib/tout.h"
#include "accel/tls.h"

enum {
Expand Down Expand Up @@ -317,10 +318,9 @@ int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev)
return 0;
}

#define MLX5_FAST_TEARDOWN_WAIT_MS 3000
int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev)
{
unsigned long end, delay_ms = MLX5_FAST_TEARDOWN_WAIT_MS;
unsigned long end, delay_ms = mlx5_tout_ms(dev, TEARDOWN);
u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {};
u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {};
int state;
Expand Down Expand Up @@ -618,17 +618,18 @@ static void mlx5_fsm_release(struct mlxfw_dev *mlxfw_dev, u32 fwhandle)
fwhandle, 0);
}

#define MLX5_FSM_REACTIVATE_TOUT 5000 /* msecs */
static int mlx5_fsm_reactivate(struct mlxfw_dev *mlxfw_dev, u8 *status)
{
unsigned long exp_time = jiffies + msecs_to_jiffies(MLX5_FSM_REACTIVATE_TOUT);
struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
u32 out[MLX5_ST_SZ_DW(mirc_reg)];
u32 in[MLX5_ST_SZ_DW(mirc_reg)];
unsigned long exp_time;
int err;

exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FSM_REACTIVATE));

if (!MLX5_CAP_MCAM_REG2(dev, mirc))
return -EOPNOTSUPP;

Expand Down
16 changes: 7 additions & 9 deletions drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "fw_reset.h"
#include "diag/fw_tracer.h"
#include "lib/tout.h"

enum {
MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
Expand Down Expand Up @@ -228,8 +229,6 @@ static void mlx5_sync_reset_request_event(struct work_struct *work)
mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. Device reset is expected.\n");
}

#define MLX5_PCI_LINK_UP_TIMEOUT 2000

static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev)
{
struct pci_bus *bridge_bus = dev->pdev->bus;
Expand Down Expand Up @@ -286,7 +285,7 @@ static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev)
goto restore;
}

timeout = jiffies + msecs_to_jiffies(MLX5_PCI_LINK_UP_TIMEOUT);
timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE));
do {
err = pci_read_config_word(bridge, cap + PCI_EXP_LNKSTA, &reg16);
if (err)
Expand All @@ -299,8 +298,8 @@ static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev)
if (reg16 & PCI_EXP_LNKSTA_DLLLA) {
mlx5_core_info(dev, "PCI Link up\n");
} else {
mlx5_core_err(dev, "PCI link not ready (0x%04x) after %d ms\n",
reg16, MLX5_PCI_LINK_UP_TIMEOUT);
mlx5_core_err(dev, "PCI link not ready (0x%04x) after %llu ms\n",
reg16, mlx5_tout_ms(dev, PCI_TOGGLE));
err = -ETIMEDOUT;
}

Expand Down Expand Up @@ -395,16 +394,15 @@ static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long acti
return NOTIFY_OK;
}

#define MLX5_FW_RESET_TIMEOUT_MSEC 5000
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev)
{
unsigned long timeout = msecs_to_jiffies(MLX5_FW_RESET_TIMEOUT_MSEC);
unsigned long timeout = msecs_to_jiffies(mlx5_tout_ms(dev, PCI_SYNC_UPDATE));
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
int err;

if (!wait_for_completion_timeout(&fw_reset->done, timeout)) {
mlx5_core_warn(dev, "FW sync reset timeout after %d seconds\n",
MLX5_FW_RESET_TIMEOUT_MSEC / 1000);
mlx5_core_warn(dev, "FW sync reset timeout after %llu seconds\n",
mlx5_tout_ms(dev, PCI_SYNC_UPDATE) / 1000);
err = -ETIMEDOUT;
goto out;
}
Expand Down
21 changes: 9 additions & 12 deletions drivers/net/ethernet/mellanox/mlx5/core/health.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@
#include "lib/eq.h"
#include "lib/mlx5.h"
#include "lib/pci_vsc.h"
#include "lib/tout.h"
#include "diag/fw_tracer.h"

enum {
MLX5_HEALTH_POLL_INTERVAL = 2 * HZ,
MAX_MISSES = 3,
};

Expand Down Expand Up @@ -219,11 +219,9 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
mutex_unlock(&dev->intf_state_mutex);
}

#define MLX5_CRDUMP_WAIT_MS 60000
#define MLX5_FW_RESET_WAIT_MS 1000
void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
{
unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS;
unsigned long end, delay_ms = mlx5_tout_ms(dev, PCI_TOGGLE);
int lock = -EBUSY;

mutex_lock(&dev->intf_state_mutex);
Expand All @@ -237,7 +235,7 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
lock = lock_sem_sw_reset(dev, true);

if (lock == -EBUSY) {
delay_ms = MLX5_CRDUMP_WAIT_MS;
delay_ms = mlx5_tout_ms(dev, FULL_CRDUMP);
goto recover_from_sw_reset;
}
/* Execute SW reset */
Expand Down Expand Up @@ -307,13 +305,11 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
mlx5_disable_device(dev);
}

/* How much time to wait until health resetting the driver (in msecs) */
#define MLX5_RECOVERY_WAIT_MSECS 60000
int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev)
{
unsigned long end;

end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
end = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FW_RESET));
while (sensor_pci_not_working(dev)) {
if (time_after(jiffies, end))
return -ETIMEDOUT;
Expand Down Expand Up @@ -674,13 +670,13 @@ static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
devlink_health_reporter_destroy(health->fw_fatal_reporter);
}

static unsigned long get_next_poll_jiffies(void)
static unsigned long get_next_poll_jiffies(struct mlx5_core_dev *dev)
{
unsigned long next;

get_random_bytes(&next, sizeof(next));
next %= HZ;
next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
next += jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL));

return next;
}
Expand Down Expand Up @@ -740,11 +736,12 @@ static void poll_health(struct timer_list *t)
queue_work(health->wq, &health->report_work);

out:
mod_timer(&health->timer, get_next_poll_jiffies());
mod_timer(&health->timer, get_next_poll_jiffies(dev));
}

void mlx5_start_health_poll(struct mlx5_core_dev *dev)
{
u64 poll_interval_ms = mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL);
struct mlx5_core_health *health = &dev->priv.health;

timer_setup(&health->timer, poll_health, 0);
Expand All @@ -753,7 +750,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
health->health = &dev->iseg->health;
health->health_counter = &dev->iseg->health_counter;

health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL);
health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms);
add_timer(&health->timer);
}

Expand Down
68 changes: 67 additions & 1 deletion drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,17 @@ static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
[MLX5_TO_FW_INIT_MS] = 2000,
[MLX5_TO_CMD_MS] = 60000
[MLX5_TO_CMD_MS] = 60000,
[MLX5_TO_PCI_TOGGLE_MS] = 2000,
[MLX5_TO_HEALTH_POLL_INTERVAL_MS] = 2000,
[MLX5_TO_FULL_CRDUMP_MS] = 60000,
[MLX5_TO_FW_RESET_MS] = 60000,
[MLX5_TO_FLUSH_ON_ERROR_MS] = 2000,
[MLX5_TO_PCI_SYNC_UPDATE_MS] = 5000,
[MLX5_TO_TEARDOWN_MS] = 3000,
[MLX5_TO_FSM_REACTIVATE_MS] = 5000,
[MLX5_TO_RECLAIM_PAGES_MS] = 5000,
[MLX5_TO_RECLAIM_VFS_PAGES_MS] = 120000
};

static void tout_set(struct mlx5_core_dev *dev, u64 val, enum mlx5_timeouts_types type)
Expand Down Expand Up @@ -94,3 +104,59 @@ u64 _mlx5_tout_ms(struct mlx5_core_dev *dev, enum mlx5_timeouts_types type)
{
return dev->timeouts->to[type];
}

#define MLX5_TIMEOUT_QUERY(fld, reg_out) \
({ \
struct mlx5_ifc_default_timeout_bits *time_field; \
u32 to_multi, to_value; \
u64 to_val_ms; \
\
time_field = MLX5_ADDR_OF(dtor_reg, reg_out, fld); \
to_multi = MLX5_GET(default_timeout, time_field, to_multiplier); \
to_value = MLX5_GET(default_timeout, time_field, to_value); \
to_val_ms = tout_convert_reg_field_to_ms(to_multi, to_value); \
to_val_ms; \
})

#define MLX5_TIMEOUT_FILL(fld, reg_out, dev, to_type, to_extra) \
({ \
u64 fw_to = MLX5_TIMEOUT_QUERY(fld, reg_out); \
tout_set(dev, fw_to + (to_extra), to_type); \
fw_to; \
})

static int tout_query_dtor(struct mlx5_core_dev *dev)
{
u64 pcie_toggle_to_val, tear_down_to_val;
u32 out[MLX5_ST_SZ_DW(dtor_reg)] = {};
u32 in[MLX5_ST_SZ_DW(dtor_reg)] = {};
int err;

err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_DTOR, 0, 0);
if (err)
return err;

pcie_toggle_to_val = MLX5_TIMEOUT_FILL(pcie_toggle_to, out, dev, MLX5_TO_PCI_TOGGLE_MS, 0);
MLX5_TIMEOUT_FILL(fw_reset_to, out, dev, MLX5_TO_FW_RESET_MS, pcie_toggle_to_val);

tear_down_to_val = MLX5_TIMEOUT_FILL(tear_down_to, out, dev, MLX5_TO_TEARDOWN_MS, 0);
MLX5_TIMEOUT_FILL(pci_sync_update_to, out, dev, MLX5_TO_PCI_SYNC_UPDATE_MS,
tear_down_to_val);

MLX5_TIMEOUT_FILL(health_poll_to, out, dev, MLX5_TO_HEALTH_POLL_INTERVAL_MS, 0);
MLX5_TIMEOUT_FILL(full_crdump_to, out, dev, MLX5_TO_FULL_CRDUMP_MS, 0);
MLX5_TIMEOUT_FILL(flush_on_err_to, out, dev, MLX5_TO_FLUSH_ON_ERROR_MS, 0);
MLX5_TIMEOUT_FILL(fsm_reactivate_to, out, dev, MLX5_TO_FSM_REACTIVATE_MS, 0);
MLX5_TIMEOUT_FILL(reclaim_pages_to, out, dev, MLX5_TO_RECLAIM_PAGES_MS, 0);
MLX5_TIMEOUT_FILL(reclaim_vfs_pages_to, out, dev, MLX5_TO_RECLAIM_VFS_PAGES_MS, 0);

return 0;
}

int mlx5_tout_query_dtor(struct mlx5_core_dev *dev)
{
if (tout_is_supported(dev))
return tout_query_dtor(dev);

return 0;
}
13 changes: 13 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,26 @@ enum mlx5_timeouts_types {
MLX5_TO_FW_INIT_MS,
MLX5_TO_CMD_MS,

/* DTOR timeouts */
MLX5_TO_PCI_TOGGLE_MS,
MLX5_TO_HEALTH_POLL_INTERVAL_MS,
MLX5_TO_FULL_CRDUMP_MS,
MLX5_TO_FW_RESET_MS,
MLX5_TO_FLUSH_ON_ERROR_MS,
MLX5_TO_PCI_SYNC_UPDATE_MS,
MLX5_TO_TEARDOWN_MS,
MLX5_TO_FSM_REACTIVATE_MS,
MLX5_TO_RECLAIM_PAGES_MS,
MLX5_TO_RECLAIM_VFS_PAGES_MS,

MAX_TIMEOUT_TYPES
};

struct mlx5_core_dev;
int mlx5_tout_init(struct mlx5_core_dev *dev);
void mlx5_tout_cleanup(struct mlx5_core_dev *dev);
void mlx5_tout_query_iseg(struct mlx5_core_dev *dev);
int mlx5_tout_query_dtor(struct mlx5_core_dev *dev);
u64 _mlx5_tout_ms(struct mlx5_core_dev *dev, enum mlx5_timeouts_types type);

#define mlx5_tout_ms(dev, type) _mlx5_tout_ms(dev, MLX5_TO_##type##_MS)
Expand Down
6 changes: 6 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,12 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
goto err_disable_hca;
}

err = mlx5_tout_query_dtor(dev);
if (err) {
mlx5_core_err(dev, "failed to read dtor\n");
goto reclaim_boot_pages;
}

err = set_hca_ctrl(dev);
if (err) {
mlx5_core_err(dev, "set_hca_ctrl failed\n");
Expand Down
Loading

0 comments on commit 32def41

Please sign in to comment.