Skip to content

Commit

Permalink
---
Browse files Browse the repository at this point in the history
yaml
---
r: 368195
b: refs/heads/master
c: 626950d
h: refs/heads/master
i:
  368193: 44a7d3d
  368191: e432360
v: v3
  • Loading branch information
Alexandre Rames authored and Ben Hutchings committed Mar 7, 2013
1 parent 1063ba0 commit 0bba801
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 27 deletions.
2 changes: 1 addition & 1 deletion [refs]
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
---
refs/heads/master: 634ab72c39b3df78ac7d6ccd4a50133059bae14f
refs/heads/master: 626950db84c065925ee10c2e833da265cbda8800
207 changes: 185 additions & 22 deletions trunk/drivers/net/ethernet/sfc/efx.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
#include <linux/ethtool.h>
#include <linux/topology.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/cpu_rmap.h>
#include <linux/aer.h>
#include "net_driver.h"
#include "efx.h"
#include "nic.h"
Expand Down Expand Up @@ -71,17 +73,19 @@ const char *const efx_loopback_mode_names[] = {

const unsigned int efx_reset_type_max = RESET_TYPE_MAX;
const char *const efx_reset_type_names[] = {
[RESET_TYPE_INVISIBLE] = "INVISIBLE",
[RESET_TYPE_ALL] = "ALL",
[RESET_TYPE_WORLD] = "WORLD",
[RESET_TYPE_DISABLE] = "DISABLE",
[RESET_TYPE_TX_WATCHDOG] = "TX_WATCHDOG",
[RESET_TYPE_INT_ERROR] = "INT_ERROR",
[RESET_TYPE_RX_RECOVERY] = "RX_RECOVERY",
[RESET_TYPE_RX_DESC_FETCH] = "RX_DESC_FETCH",
[RESET_TYPE_TX_DESC_FETCH] = "TX_DESC_FETCH",
[RESET_TYPE_TX_SKIP] = "TX_SKIP",
[RESET_TYPE_MC_FAILURE] = "MC_FAILURE",
[RESET_TYPE_INVISIBLE] = "INVISIBLE",
[RESET_TYPE_ALL] = "ALL",
[RESET_TYPE_RECOVER_OR_ALL] = "RECOVER_OR_ALL",
[RESET_TYPE_WORLD] = "WORLD",
[RESET_TYPE_RECOVER_OR_DISABLE] = "RECOVER_OR_DISABLE",
[RESET_TYPE_DISABLE] = "DISABLE",
[RESET_TYPE_TX_WATCHDOG] = "TX_WATCHDOG",
[RESET_TYPE_INT_ERROR] = "INT_ERROR",
[RESET_TYPE_RX_RECOVERY] = "RX_RECOVERY",
[RESET_TYPE_RX_DESC_FETCH] = "RX_DESC_FETCH",
[RESET_TYPE_TX_DESC_FETCH] = "TX_DESC_FETCH",
[RESET_TYPE_TX_SKIP] = "TX_SKIP",
[RESET_TYPE_MC_FAILURE] = "MC_FAILURE",
};

#define EFX_MAX_MTU (9 * 1024)
Expand Down Expand Up @@ -117,9 +121,12 @@ MODULE_PARM_DESC(separate_tx_channels,
static int napi_weight = 64;

/* This is the time (in jiffies) between invocations of the hardware
* monitor. On Falcon-based NICs, this will:
* monitor.
* On Falcon-based NICs, this will:
* - Check the on-board hardware monitor;
* - Poll the link state and reconfigure the hardware as necessary.
* On Siena-based NICs for power systems with EEH support, this will give EEH a
* chance to start.
*/
static unsigned int efx_monitor_interval = 1 * HZ;

Expand Down Expand Up @@ -203,13 +210,14 @@ static void efx_stop_all(struct efx_nic *efx);
#define EFX_ASSERT_RESET_SERIALISED(efx) \
do { \
if ((efx->state == STATE_READY) || \
(efx->state == STATE_RECOVERY) || \
(efx->state == STATE_DISABLED)) \
ASSERT_RTNL(); \
} while (0)

static int efx_check_disabled(struct efx_nic *efx)
{
if (efx->state == STATE_DISABLED) {
if (efx->state == STATE_DISABLED || efx->state == STATE_RECOVERY) {
netif_err(efx, drv, efx->net_dev,
"device is disabled due to earlier errors\n");
return -EIO;
Expand Down Expand Up @@ -677,7 +685,7 @@ static void efx_stop_datapath(struct efx_nic *efx)
BUG_ON(efx->port_enabled);

/* Only perform flush if dma is enabled */
if (dev->is_busmaster) {
if (dev->is_busmaster && efx->state != STATE_RECOVERY) {
rc = efx_nic_flush_queues(efx);

if (rc && EFX_WORKAROUND_7803(efx)) {
Expand Down Expand Up @@ -1590,13 +1598,15 @@ static void efx_start_all(struct efx_nic *efx)
efx_start_port(efx);
efx_start_datapath(efx);

/* Start the hardware monitor if there is one. Otherwise (we're link
* event driven), we have to poll the PHY because after an event queue
* flush, we could have a missed a link state change */
if (efx->type->monitor != NULL) {
/* Start the hardware monitor if there is one */
if (efx->type->monitor != NULL)
queue_delayed_work(efx->workqueue, &efx->monitor_work,
efx_monitor_interval);
} else {

/* If link state detection is normally event-driven, we have
* to poll now because we could have missed a change
*/
if (efx_nic_rev(efx) >= EFX_REV_SIENA_A0) {
mutex_lock(&efx->mac_lock);
if (efx->phy_op->poll(efx))
efx_link_status_changed(efx);
Expand Down Expand Up @@ -2303,7 +2313,9 @@ int efx_reset(struct efx_nic *efx, enum reset_type method)

out:
/* Leave device stopped if necessary */
disabled = rc || method == RESET_TYPE_DISABLE;
disabled = rc ||
method == RESET_TYPE_DISABLE ||
method == RESET_TYPE_RECOVER_OR_DISABLE;
rc2 = efx_reset_up(efx, method, !disabled);
if (rc2) {
disabled = true;
Expand All @@ -2322,13 +2334,48 @@ int efx_reset(struct efx_nic *efx, enum reset_type method)
return rc;
}

/* Try recovery mechanisms.
* For now only EEH is supported.
* Returns 0 if the recovery mechanisms are unsuccessful.
* Returns a non-zero value otherwise.
*/
static int efx_try_recovery(struct efx_nic *efx)
{
#ifdef CONFIG_EEH
/* A PCI error can occur and not be seen by EEH because nothing
* happens on the PCI bus. In this case the driver may fail and
* schedule a 'recover or reset', leading to this recovery handler.
* Manually call the eeh failure check function.
*/
struct eeh_dev *eehdev =
of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev));

if (eeh_dev_check_failure(eehdev)) {
/* The EEH mechanisms will handle the error and reset the
* device if necessary.
*/
return 1;
}
#endif
return 0;
}

/* The worker thread exists so that code that cannot sleep can
* schedule a reset for later.
*/
static void efx_reset_work(struct work_struct *data)
{
struct efx_nic *efx = container_of(data, struct efx_nic, reset_work);
unsigned long pending = ACCESS_ONCE(efx->reset_pending);
unsigned long pending;
enum reset_type method;

pending = ACCESS_ONCE(efx->reset_pending);
method = fls(pending) - 1;

if ((method == RESET_TYPE_RECOVER_OR_DISABLE ||
method == RESET_TYPE_RECOVER_OR_ALL) &&
efx_try_recovery(efx))
return;

if (!pending)
return;
Expand All @@ -2340,7 +2387,7 @@ static void efx_reset_work(struct work_struct *data)
* it cannot change again.
*/
if (efx->state == STATE_READY)
(void)efx_reset(efx, fls(pending) - 1);
(void)efx_reset(efx, method);

rtnl_unlock();
}
Expand All @@ -2349,11 +2396,20 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type)
{
enum reset_type method;

if (efx->state == STATE_RECOVERY) {
netif_dbg(efx, drv, efx->net_dev,
"recovering: skip scheduling %s reset\n",
RESET_TYPE(type));
return;
}

switch (type) {
case RESET_TYPE_INVISIBLE:
case RESET_TYPE_ALL:
case RESET_TYPE_RECOVER_OR_ALL:
case RESET_TYPE_WORLD:
case RESET_TYPE_DISABLE:
case RESET_TYPE_RECOVER_OR_DISABLE:
method = type;
netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n",
RESET_TYPE(method));
Expand Down Expand Up @@ -2563,6 +2619,8 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
efx_fini_struct(efx);
pci_set_drvdata(pci_dev, NULL);
free_netdev(efx->net_dev);

pci_disable_pcie_error_reporting(pci_dev);
};

/* NIC VPD information
Expand Down Expand Up @@ -2735,6 +2793,11 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
netif_warn(efx, probe, efx->net_dev,
"failed to create MTDs (%d)\n", rc);

rc = pci_enable_pcie_error_reporting(pci_dev);
if (rc && rc != -EINVAL)
netif_warn(efx, probe, efx->net_dev,
"pci_enable_pcie_error_reporting failed (%d)\n", rc);

return 0;

fail4:
Expand Down Expand Up @@ -2859,12 +2922,112 @@ static const struct dev_pm_ops efx_pm_ops = {
.restore = efx_pm_resume,
};

/* A PCI error affecting this device was detected.
* At this point MMIO and DMA may be disabled.
* Stop the software path and request a slot reset.
*/
pci_ers_result_t efx_io_error_detected(struct pci_dev *pdev,
enum pci_channel_state state)
{
pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
struct efx_nic *efx = pci_get_drvdata(pdev);

if (state == pci_channel_io_perm_failure)
return PCI_ERS_RESULT_DISCONNECT;

rtnl_lock();

if (efx->state != STATE_DISABLED) {
efx->state = STATE_RECOVERY;
efx->reset_pending = 0;

efx_device_detach_sync(efx);

efx_stop_all(efx);
efx_stop_interrupts(efx, false);

status = PCI_ERS_RESULT_NEED_RESET;
} else {
/* If the interface is disabled we don't want to do anything
* with it.
*/
status = PCI_ERS_RESULT_RECOVERED;
}

rtnl_unlock();

pci_disable_device(pdev);

return status;
}

/* Fake a successfull reset, which will be performed later in efx_io_resume. */
pci_ers_result_t efx_io_slot_reset(struct pci_dev *pdev)
{
struct efx_nic *efx = pci_get_drvdata(pdev);
pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
int rc;

if (pci_enable_device(pdev)) {
netif_err(efx, hw, efx->net_dev,
"Cannot re-enable PCI device after reset.\n");
status = PCI_ERS_RESULT_DISCONNECT;
}

rc = pci_cleanup_aer_uncorrect_error_status(pdev);
if (rc) {
netif_err(efx, hw, efx->net_dev,
"pci_cleanup_aer_uncorrect_error_status failed (%d)\n", rc);
/* Non-fatal error. Continue. */
}

return status;
}

/* Perform the actual reset and resume I/O operations. */
static void efx_io_resume(struct pci_dev *pdev)
{
struct efx_nic *efx = pci_get_drvdata(pdev);
int rc;

rtnl_lock();

if (efx->state == STATE_DISABLED)
goto out;

rc = efx_reset(efx, RESET_TYPE_ALL);
if (rc) {
netif_err(efx, hw, efx->net_dev,
"efx_reset failed after PCI error (%d)\n", rc);
} else {
efx->state = STATE_READY;
netif_dbg(efx, hw, efx->net_dev,
"Done resetting and resuming IO after PCI error.\n");
}

out:
rtnl_unlock();
}

/* For simplicity and reliability, we always require a slot reset and try to
* reset the hardware when a pci error affecting the device is detected.
* We leave both the link_reset and mmio_enabled callback unimplemented:
* with our request for slot reset the mmio_enabled callback will never be
* called, and the link_reset callback is not used by AER or EEH mechanisms.
*/
static struct pci_error_handlers efx_err_handlers = {
.error_detected = efx_io_error_detected,
.slot_reset = efx_io_slot_reset,
.resume = efx_io_resume,
};

static struct pci_driver efx_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = efx_pci_table,
.probe = efx_pci_probe,
.remove = efx_pci_remove,
.driver.pm = &efx_pm_ops,
.err_handler = &efx_err_handlers,
};

/**************************************************************************
Expand Down
12 changes: 9 additions & 3 deletions trunk/drivers/net/ethernet/sfc/enum.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,12 @@ enum efx_loopback_mode {
* Reset methods are numbered in order of increasing scope.
*
* @RESET_TYPE_INVISIBLE: Reset datapath and MAC (Falcon only)
* @RESET_TYPE_RECOVER_OR_ALL: Try to recover. Apply RESET_TYPE_ALL
* if unsuccessful.
* @RESET_TYPE_ALL: Reset datapath, MAC and PHY
* @RESET_TYPE_WORLD: Reset as much as possible
* @RESET_TYPE_RECOVER_OR_DISABLE: Try to recover. Apply RESET_TYPE_DISABLE if
* unsuccessful.
* @RESET_TYPE_DISABLE: Reset datapath, MAC and PHY; leave NIC disabled
* @RESET_TYPE_TX_WATCHDOG: reset due to TX watchdog
* @RESET_TYPE_INT_ERROR: reset due to internal error
Expand All @@ -150,9 +154,11 @@ enum efx_loopback_mode {
*/
enum reset_type {
RESET_TYPE_INVISIBLE = 0,
RESET_TYPE_ALL = 1,
RESET_TYPE_WORLD = 2,
RESET_TYPE_DISABLE = 3,
RESET_TYPE_RECOVER_OR_ALL = 1,
RESET_TYPE_ALL = 2,
RESET_TYPE_WORLD = 3,
RESET_TYPE_RECOVER_OR_DISABLE = 4,
RESET_TYPE_DISABLE = 5,
RESET_TYPE_MAX_METHOD,
RESET_TYPE_TX_WATCHDOG,
RESET_TYPE_INT_ERROR,
Expand Down
1 change: 1 addition & 0 deletions trunk/drivers/net/ethernet/sfc/net_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ enum nic_state {
STATE_UNINIT = 0, /* device being probed/removed or is frozen */
STATE_READY = 1, /* hardware ready and netdev registered */
STATE_DISABLED = 2, /* device disabled due to hardware errors */
STATE_RECOVERY = 3, /* device recovering from PCI error */
};

/*
Expand Down
Loading

0 comments on commit 0bba801

Please sign in to comment.