From f7b5bd725b737de3f2c4a836e07c82ba156d75df Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 14 Sep 2023 15:31:57 -0700 Subject: [PATCH 1/4] pds_core: check health in devcmd wait Similar to what we do in the AdminQ, check for devcmd health while waiting for an answer. Signed-off-by: Shannon Nelson Reviewed-by: Brett Creeley Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/dev.c | 11 +++++++++-- include/linux/pds/pds_core_if.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c index f77cd9f5a2fda..7c1b965d61a92 100644 --- a/drivers/net/ethernet/amd/pds_core/dev.c +++ b/drivers/net/ethernet/amd/pds_core/dev.c @@ -42,6 +42,8 @@ int pdsc_err_to_errno(enum pds_core_status_code code) return -ERANGE; case PDS_RC_BAD_ADDR: return -EFAULT; + case PDS_RC_BAD_PCI: + return -ENXIO; case PDS_RC_EOPCODE: case PDS_RC_EINTR: case PDS_RC_DEV_CMD: @@ -62,7 +64,7 @@ bool pdsc_is_fw_running(struct pdsc *pdsc) /* Firmware is useful only if the running bit is set and * fw_status != 0xff (bad PCI read) */ - return (pdsc->fw_status != 0xff) && + return (pdsc->fw_status != PDS_RC_BAD_PCI) && (pdsc->fw_status & PDS_CORE_FW_STS_F_RUNNING); } @@ -128,6 +130,7 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds) unsigned long max_wait; unsigned long duration; int timeout = 0; + bool running; int done = 0; int err = 0; int status; @@ -136,6 +139,10 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds) max_wait = start_time + (max_seconds * HZ); while (!done && !timeout) { + running = pdsc_is_fw_running(pdsc); + if (!running) + break; + done = pdsc_devcmd_done(pdsc); if (done) break; @@ -152,7 +159,7 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds) dev_dbg(dev, "DEVCMD %d %s after %ld secs\n", opcode, pdsc_devcmd_str(opcode), duration / HZ); - if (!done || timeout) { + if ((!done || timeout) && running) { dev_err(dev, "DEVCMD %d %s timeout, done %d timeout %d max_seconds=%d\n", opcode, pdsc_devcmd_str(opcode), done, timeout, max_seconds); diff --git a/include/linux/pds/pds_core_if.h b/include/linux/pds/pds_core_if.h index e838a2b90440c..17a87c1a55d7c 100644 --- a/include/linux/pds/pds_core_if.h +++ b/include/linux/pds/pds_core_if.h @@ -79,6 +79,7 @@ enum pds_core_status_code { PDS_RC_EVFID = 31, /* VF ID does not exist */ PDS_RC_BAD_FW = 32, /* FW file is invalid or corrupted */ PDS_RC_ECLIENT = 33, /* No such client id */ + PDS_RC_BAD_PCI = 255, /* Broken PCI when reading status */ }; /** From d557c094e7400929bbcd9df6500af0d5e3ab08c1 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 14 Sep 2023 15:31:58 -0700 Subject: [PATCH 2/4] pds_core: keep viftypes table across reset Keep the viftypes and the current enable/disable states across a recovery action. Signed-off-by: Shannon Nelson Reviewed-by: Brett Creeley Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 36f9b932b9e2a..6e426202ab835 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -445,12 +445,13 @@ int pdsc_setup(struct pdsc *pdsc, bool init) goto err_out_teardown; /* Set up the VIFs */ - err = pdsc_viftypes_init(pdsc); - if (err) - goto err_out_teardown; + if (init) { + err = pdsc_viftypes_init(pdsc); + if (err) + goto err_out_teardown; - if (init) pdsc_debugfs_add_viftype(pdsc); + } clear_bit(PDSC_S_FW_DEAD, &pdsc->state); return 0; @@ -469,8 +470,10 @@ void pdsc_teardown(struct pdsc *pdsc, bool removing) pdsc_qcq_free(pdsc, &pdsc->notifyqcq); pdsc_qcq_free(pdsc, &pdsc->adminqcq); - kfree(pdsc->viftype_status); - pdsc->viftype_status = NULL; + if (removing) { + kfree(pdsc->viftype_status); + pdsc->viftype_status = NULL; + } if (pdsc->intr_info) { for (i = 0; i < pdsc->nintrs; i++) From ffa55858330f267beec995fc4f68098c91311c64 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 14 Sep 2023 15:31:59 -0700 Subject: [PATCH 3/4] pds_core: implement pci reset handlers Implement the callbacks for a nice PCI reset. These get called when a user is nice enough to use the sysfs PCI reset entry, e.g. echo 1 > /sys/bus/pci/devices/0000:2b:00.0/reset Signed-off-by: Shannon Nelson Reviewed-by: Brett Creeley Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/core.c | 14 +++++-- drivers/net/ethernet/amd/pds_core/core.h | 4 ++ drivers/net/ethernet/amd/pds_core/main.c | 50 ++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 6e426202ab835..c1b6b5f7c0b52 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -515,7 +515,7 @@ void pdsc_stop(struct pdsc *pdsc) PDS_CORE_INTR_MASK_SET); } -static void pdsc_fw_down(struct pdsc *pdsc) +void pdsc_fw_down(struct pdsc *pdsc) { union pds_core_notifyq_comp reset_event = { .reset.ecode = cpu_to_le16(PDS_EVENT_RESET), @@ -523,10 +523,13 @@ static void pdsc_fw_down(struct pdsc *pdsc) }; if (test_and_set_bit(PDSC_S_FW_DEAD, &pdsc->state)) { - dev_err(pdsc->dev, "%s: already happening\n", __func__); + dev_warn(pdsc->dev, "%s: already happening\n", __func__); return; } + if (pdsc->pdev->is_virtfn) + return; + /* Notify clients of fw_down */ if (pdsc->fw_reporter) devlink_health_report(pdsc->fw_reporter, "FW down reported", pdsc); @@ -536,7 +539,7 @@ static void pdsc_fw_down(struct pdsc *pdsc) pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); } -static void pdsc_fw_up(struct pdsc *pdsc) +void pdsc_fw_up(struct pdsc *pdsc) { union pds_core_notifyq_comp reset_event = { .reset.ecode = cpu_to_le16(PDS_EVENT_RESET), @@ -549,6 +552,11 @@ static void pdsc_fw_up(struct pdsc *pdsc) return; } + if (pdsc->pdev->is_virtfn) { + clear_bit(PDSC_S_FW_DEAD, &pdsc->state); + return; + } + err = pdsc_setup(pdsc, PDSC_SETUP_RECOVERY); if (err) goto err_out; diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index e545fafc48196..19c1957167da2 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -309,4 +309,8 @@ irqreturn_t pdsc_adminq_isr(int irq, void *data); int pdsc_firmware_update(struct pdsc *pdsc, const struct firmware *fw, struct netlink_ext_ack *extack); + +void pdsc_fw_down(struct pdsc *pdsc); +void pdsc_fw_up(struct pdsc *pdsc); + #endif /* _PDSC_H_ */ diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index 3a45bf474a19a..4c7f982c12a1d 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -445,12 +445,62 @@ static void pdsc_remove(struct pci_dev *pdev) devlink_free(dl); } +static void pdsc_reset_prepare(struct pci_dev *pdev) +{ + struct pdsc *pdsc = pci_get_drvdata(pdev); + + pdsc_fw_down(pdsc); + + pci_free_irq_vectors(pdev); + pdsc_unmap_bars(pdsc); + pci_release_regions(pdev); + pci_disable_device(pdev); +} + +static void pdsc_reset_done(struct pci_dev *pdev) +{ + struct pdsc *pdsc = pci_get_drvdata(pdev); + struct device *dev = pdsc->dev; + int err; + + err = pci_enable_device(pdev); + if (err) { + dev_err(dev, "Cannot enable PCI device: %pe\n", ERR_PTR(err)); + return; + } + pci_set_master(pdev); + + if (!pdev->is_virtfn) { + pcie_print_link_status(pdsc->pdev); + + err = pci_request_regions(pdsc->pdev, PDS_CORE_DRV_NAME); + if (err) { + dev_err(pdsc->dev, "Cannot request PCI regions: %pe\n", + ERR_PTR(err)); + return; + } + + err = pdsc_map_bars(pdsc); + if (err) + return; + } + + pdsc_fw_up(pdsc); +} + +static const struct pci_error_handlers pdsc_err_handler = { + /* FLR handling */ + .reset_prepare = pdsc_reset_prepare, + .reset_done = pdsc_reset_done, +}; + static struct pci_driver pdsc_driver = { .name = PDS_CORE_DRV_NAME, .id_table = pdsc_id_table, .probe = pdsc_probe, .remove = pdsc_remove, .sriov_configure = pdsc_sriov_configure, + .err_handler = &pdsc_err_handler, }; void *pdsc_get_pf_struct(struct pci_dev *vf_pdev) From 1e18ec3e9d46e4ad2b6507c3bfc7f59e2ab449a2 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 14 Sep 2023 15:32:00 -0700 Subject: [PATCH 4/4] pds_core: add attempts to fix broken PCI If we see a 0xff value from a PCI register read, we know that the PCI connection is broken, possibly by a low level reset that didn't go through the nice pci_error_handlers path. Make use of the PCI cleanup code that we already have from the reset handlers and add some detection and attempted recovery from a broken PCI connection. Signed-off-by: Shannon Nelson Reviewed-by: Brett Creeley Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/core.c | 14 ++++++++++++++ drivers/net/ethernet/amd/pds_core/core.h | 3 +++ drivers/net/ethernet/amd/pds_core/main.c | 4 ++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index c1b6b5f7c0b52..2a8643e167e16 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -578,6 +578,18 @@ void pdsc_fw_up(struct pdsc *pdsc) pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); } +static void pdsc_check_pci_health(struct pdsc *pdsc) +{ + u8 fw_status = ioread8(&pdsc->info_regs->fw_status); + + /* is PCI broken? */ + if (fw_status != PDS_RC_BAD_PCI) + return; + + pdsc_reset_prepare(pdsc->pdev); + pdsc_reset_done(pdsc->pdev); +} + void pdsc_health_thread(struct work_struct *work) { struct pdsc *pdsc = container_of(work, struct pdsc, health_work); @@ -604,6 +616,8 @@ void pdsc_health_thread(struct work_struct *work) pdsc_fw_down(pdsc); } + pdsc_check_pci_health(pdsc); + pdsc->fw_generation = pdsc->fw_status & PDS_CORE_FW_STS_F_GENERATION; out_unlock: diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index 19c1957167da2..f3a7deda99724 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -283,6 +283,9 @@ int pdsc_devcmd_reset(struct pdsc *pdsc); int pdsc_dev_reinit(struct pdsc *pdsc); int pdsc_dev_init(struct pdsc *pdsc); +void pdsc_reset_prepare(struct pci_dev *pdev); +void pdsc_reset_done(struct pci_dev *pdev); + int pdsc_intr_alloc(struct pdsc *pdsc, char *name, irq_handler_t handler, void *data); void pdsc_intr_free(struct pdsc *pdsc, int index); diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index 4c7f982c12a1d..3080898d7b95b 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -445,7 +445,7 @@ static void pdsc_remove(struct pci_dev *pdev) devlink_free(dl); } -static void pdsc_reset_prepare(struct pci_dev *pdev) +void pdsc_reset_prepare(struct pci_dev *pdev) { struct pdsc *pdsc = pci_get_drvdata(pdev); @@ -457,7 +457,7 @@ static void pdsc_reset_prepare(struct pci_dev *pdev) pci_disable_device(pdev); } -static void pdsc_reset_done(struct pci_dev *pdev) +void pdsc_reset_done(struct pci_dev *pdev) { struct pdsc *pdsc = pci_get_drvdata(pdev); struct device *dev = pdsc->dev;