diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h index e4408473e802b..7202084198bdb 100644 --- a/drivers/gpu/drm/xe/regs/xe_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_regs.h @@ -70,8 +70,15 @@ #define SOFTWARE_FLAGS_SPR33 XE_REG(0x4f084) +#define GU_CNTL_PROTECTED XE_REG(0x10100C) +#define DRIVERINT_FLR_DIS REG_BIT(31) + #define GU_CNTL XE_REG(0x101010) #define LMEM_INIT REG_BIT(7) +#define DRIVERFLR REG_BIT(31) + +#define GU_DEBUG XE_REG(0x101018) +#define DRIVERFLR_STATUS REG_BIT(31) #define XEHP_CLOCK_GATE_DIS XE_REG(0x101014) #define SGSI_SIDECLK_DIS REG_BIT(17) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index ae0b7349c3e31..5869ba7e0cdc1 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -5,6 +5,8 @@ #include "xe_device.h" +#include + #include #include #include @@ -252,6 +254,78 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, return ERR_PTR(err); } +/* + * The driver-initiated FLR is the highest level of reset that we can trigger + * from within the driver. It is different from the PCI FLR in that it doesn't + * fully reset the SGUnit and doesn't modify the PCI config space and therefore + * it doesn't require a re-enumeration of the PCI BARs. However, the + * driver-initiated FLR does still cause a reset of both GT and display and a + * memory wipe of local and stolen memory, so recovery would require a full HW + * re-init and saving/restoring (or re-populating) the wiped memory. Since we + * perform the FLR as the very last action before releasing access to the HW + * during the driver release flow, we don't attempt recovery at all, because + * if/when a new instance of i915 is bound to the device it will do a full + * re-init anyway. + */ +static void xe_driver_flr(struct xe_device *xe) +{ + const unsigned int flr_timeout = 3 * MICRO; /* specs recommend a 3s wait */ + struct xe_gt *gt = xe_root_mmio_gt(xe); + int ret; + + if (xe_mmio_read32(gt, GU_CNTL_PROTECTED) & DRIVERINT_FLR_DIS) { + drm_info_once(&xe->drm, "BIOS Disabled Driver-FLR\n"); + return; + } + + drm_dbg(&xe->drm, "Triggering Driver-FLR\n"); + + /* + * Make sure any pending FLR requests have cleared by waiting for the + * FLR trigger bit to go to zero. Also clear GU_DEBUG's DRIVERFLR_STATUS + * to make sure it's not still set from a prior attempt (it's a write to + * clear bit). + * Note that we should never be in a situation where a previous attempt + * is still pending (unless the HW is totally dead), but better to be + * safe in case something unexpected happens + */ + ret = xe_mmio_wait32(gt, GU_CNTL, DRIVERFLR, 0, flr_timeout, NULL, false); + if (ret) { + drm_err(&xe->drm, "Driver-FLR-prepare wait for ready failed! %d\n", ret); + return; + } + xe_mmio_write32(gt, GU_DEBUG, DRIVERFLR_STATUS); + + /* Trigger the actual Driver-FLR */ + xe_mmio_rmw32(gt, GU_CNTL, 0, DRIVERFLR); + + /* Wait for hardware teardown to complete */ + ret = xe_mmio_wait32(gt, GU_CNTL, DRIVERFLR, 0, flr_timeout, NULL, false); + if (ret) { + drm_err(&xe->drm, "Driver-FLR-teardown wait completion failed! %d\n", ret); + return; + } + + /* Wait for hardware/firmware re-init to complete */ + ret = xe_mmio_wait32(gt, GU_DEBUG, DRIVERFLR_STATUS, DRIVERFLR_STATUS, + flr_timeout, NULL, false); + if (ret) { + drm_err(&xe->drm, "Driver-FLR-reinit wait completion failed! %d\n", ret); + return; + } + + /* Clear sticky completion status */ + xe_mmio_write32(gt, GU_DEBUG, DRIVERFLR_STATUS); +} + +static void xe_driver_flr_fini(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + + if (xe->needs_flr_on_fini) + xe_driver_flr(xe); +} + static void xe_device_sanitize(struct drm_device *drm, void *arg) { struct xe_device *xe = arg; @@ -283,6 +357,10 @@ int xe_device_probe(struct xe_device *xe) if (err) return err; + err = drmm_add_action_or_reset(&xe->drm, xe_driver_flr_fini, xe); + if (err) + return err; + for_each_gt(gt, xe, id) { err = xe_pcode_probe(gt); if (err) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 4bc668ff8615f..4425c2484a02b 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -388,6 +388,9 @@ struct xe_device { /** @heci_gsc: graphics security controller */ struct xe_heci_gsc heci_gsc; + /** @needs_flr_on_fini: requests function-reset on fini */ + bool needs_flr_on_fini; + /* For pcode */ struct mutex sb_lock; diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index d380f67b33659..73c090762771b 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -626,6 +626,8 @@ static int gt_reset(struct xe_gt *gt) xe_uevent_gt_reset_failure(to_pci_dev(gt_to_xe(gt)->drm.dev), gt_to_tile(gt)->id, gt->info.id); + gt_to_xe(gt)->needs_flr_on_fini = true; + return err; }