From c8fb95e7a54315460b45090f0968167a332e1657 Mon Sep 17 00:00:00 2001
From: Shuicheng Lin
Date: Tue, 15 Oct 2024 16:12:07 +0000
Subject: [PATCH 1/5] drm/xe: Enlarge the invalidation timeout from 150 to 500

Error messages like the one below occur during stress testing:

  "[ 31.004009] xe 0000:03:00.0: [drm] ERROR GT0: Global invalidation timeout"

Previously, 3 out of 1000 warm reboot executions hit this timeout.
After raising it to 500, 1000 warm reboot executions passed without a
failure.

Because xe_mmio_wait32() starts with small wait increments, it returns
early as soon as the register matches the expected value. So the larger
timeout value should have no effect during normal use cases.

v2 (Jonathan):
  - rework the commit message
v3 (Lucas):
  - add a conclusive message for the fail rate and test case
v4:
  - add Suggested-by

Suggested-by: Jia Yao
Signed-off-by: Shuicheng Lin
Cc: Lucas De Marchi
Cc: Matthew Auld
Cc: Nirmoy Das
Reviewed-by: Jonathan Cavitt
Tested-by: Zongyao Bai
Reviewed-by: Nirmoy Das
Signed-off-by: Matthew Auld
Link: https://patchwork.freedesktop.org/patch/msgid/20241015161207.1373401-1-shuicheng.lin@intel.com
(cherry picked from commit 2eb460ab9f4bc5b575f52568d17936da0af681d8)
[ Fix conflict with gt->mmio ]
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/xe_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 0a9ffc19e92f4..10fd4601b9f2a 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -890,7 +890,7 @@ void xe_device_l2_flush(struct xe_device *xe)
 
 	spin_lock(&gt->global_invl_lock);
 	xe_mmio_write32(gt, XE2_GLOBAL_INVAL, 0x1);
 
-	if (xe_mmio_wait32(gt, XE2_GLOBAL_INVAL, 0x1, 0x0, 150, NULL, true))
+	if (xe_mmio_wait32(gt, XE2_GLOBAL_INVAL, 0x1, 0x0, 500, NULL, true))
		xe_gt_err_once(gt, "Global invalidation timeout\n");
 	spin_unlock(&gt->global_invl_lock);

From 22ef43c78647dd37b0dafe2182b8650b99dbbe59 Mon Sep 17 00:00:00 2001
From: Badal Nilawar
Date: Thu, 17 Oct 2024 16:44:10 +0530
Subject: [PATCH 2/5] drm/xe/guc/ct: Flush g2h worker in case of g2h response
 timeout

In case the G2H worker doesn't get the opportunity to run within the
specified timeout, flush the G2H worker explicitly, as sketched below.
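[ A simplified sketch of the fallback being added, with extra comments;
  ct, g2h_fence and ret are the locals of guc_ct_send_recv() from the
  diff below, so this is illustrative rather than the literal driver
  code. ]

	/* Wait up to 1 s for the G2H worker to signal the fence. */
	ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);

	if (!ret) {
		/*
		 * Timed out, but the worker may simply not have been
		 * scheduled yet: force it to run synchronously now.
		 */
		flush_work(&ct->g2h_worker);

		/* The response had arrived all along; treat it as success. */
		if (g2h_fence.done)
			ret = 1;	/* same as a wait_event_timeout() hit */
	}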
v2:
  - Describe the change in the comment and add a TODO (Matt B/John H)
  - Add xe_gt_warn on fence done after G2H flush (John H)
v3:
  - Update the comment with the root cause
  - Clean up the xe_gt_warn message (John H)

Closes: https://gitlab.freedesktop.org/drm/xe/kernel/issues/1620
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/issues/2902
Signed-off-by: Badal Nilawar
Cc: Matthew Brost
Cc: Matthew Auld
Cc: John Harrison
Cc: Himal Prasad Ghimiray
Reviewed-by: Himal Prasad Ghimiray
Acked-by: Matthew Brost
Signed-off-by: Matthew Brost
Link: https://patchwork.freedesktop.org/patch/msgid/20241017111410.2553784-2-badal.nilawar@intel.com
(cherry picked from commit e5152723380404acb8175e0777b1cea57f319a01)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/xe_guc_ct.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index d16eb9ab49fbb..17986bfd88187 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -897,6 +897,24 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
 
 	ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
 
+	/*
+	 * Occasionally the G2H worker starts running only after a delay of more than
+	 * a second, even after being queued and activated by the Linux workqueue
+	 * subsystem. This leads to a G2H timeout error. The root cause lies with the
+	 * scheduling latency of the Lunarlake hybrid CPU: the issue disappears if the
+	 * Lunarlake Atom cores are disabled from the BIOS, which is beyond the xe kmd.
+	 *
+	 * TODO: Drop this change once the workqueue scheduling delay issue is fixed on the LNL hybrid CPU.
+	 */
+	if (!ret) {
+		flush_work(&ct->g2h_worker);
+		if (g2h_fence.done) {
+			xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
+				   g2h_fence.seqno, action[0]);
+			ret = 1;
+		}
+	}
+
 	/*
	 * Ensure we serialize with completion side to prevent UAF with fence going out of scope on
	 * the stack, since we have no clue if it will fire after the timeout before we can erase

From 69418db678567bdf9a4992c83d448da462ffa78c Mon Sep 17 00:00:00 2001
From: Shuicheng Lin
Date: Thu, 17 Oct 2024 22:15:47 +0000
Subject: [PATCH 3/5] drm/xe: Handle unreliable MMIO reads during forcewake

In some cases, when the driver attempts to read an MMIO register, the
hardware may return 0xFFFFFFFF. The current force wake path treats this
as a valid response, since it only checks the expected ack bit. However,
0xFFFFFFFF should be considered an invalid value, indicating that the
MMIO interface itself is unreliable. Add a log entry to highlight this
condition and return failure.

The force wake failure log level is changed from notice to err to match
the failure return value.
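[ For illustration, the essence of the added check, simplified; "value"
  is the ack register readback from the wait in the diff below. ]

	/*
	 * A PCI read the device fails to complete returns all ones on
	 * the bus, so an all-ones readback means the ack bit carries no
	 * information: the MMIO interface itself is unreliable.
	 */
	if (value == ~0) {
		xe_gt_err(gt, "MMIO unreliable: register reads 0xFFFFFFFF\n");
		ret = -EIO;	/* propagate the failure to callers */
	}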
v2 (Matt Brost):
  - set the ret value (-EIO) to kick the error up to the upper layers
v3 (Rodrigo):
  - add a commit message note for the log level promotion from notice to err
v4:
  - update the reviewed info

Suggested-by: Alex Zuo
Signed-off-by: Shuicheng Lin
Cc: Matthew Brost
Cc: Michal Wajdeczko
Reviewed-by: Himal Prasad Ghimiray
Acked-by: Badal Nilawar
Cc: Anshuman Gupta
Cc: Matt Roper
Cc: Rodrigo Vivi
Link: https://patchwork.freedesktop.org/patch/msgid/20241017221547.1564029-1-shuicheng.lin@intel.com
Signed-off-by: Rodrigo Vivi
(cherry picked from commit a9fbeabe7226a3bf90f82d0e28a02c18e3c67447)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/xe_force_wake.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_force_wake.c b/drivers/gpu/drm/xe/xe_force_wake.c
index b263fff152737..7d9fc489dcb81 100644
--- a/drivers/gpu/drm/xe/xe_force_wake.c
+++ b/drivers/gpu/drm/xe/xe_force_wake.c
@@ -115,9 +115,15 @@ static int __domain_wait(struct xe_gt *gt, struct xe_force_wake_domain *domain,
 			 XE_FORCE_WAKE_ACK_TIMEOUT_MS * USEC_PER_MSEC,
 			 &value, true);
 	if (ret)
-		xe_gt_notice(gt, "Force wake domain %d failed to ack %s (%pe) reg[%#x] = %#x\n",
-			     domain->id, str_wake_sleep(wake), ERR_PTR(ret),
-			     domain->reg_ack.addr, value);
+		xe_gt_err(gt, "Force wake domain %d failed to ack %s (%pe) reg[%#x] = %#x\n",
+			  domain->id, str_wake_sleep(wake), ERR_PTR(ret),
+			  domain->reg_ack.addr, value);
+	if (value == ~0) {
+		xe_gt_err(gt,
+			  "Force wake domain %d: %s. MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n",
+			  domain->id, str_wake_sleep(wake));
+		ret = -EIO;
+	}
 
 	return ret;
 }

From 9c1813b3253480b30604c680026c7dc721ce86d1 Mon Sep 17 00:00:00 2001
From: Nirmoy Das
Date: Wed, 16 Oct 2024 10:23:03 +0200
Subject: [PATCH 4/5] drm/xe/ufence: Prefetch ufence addr to catch bogus
 address

access_ok() only checks for address overflow, so also try to read from
the address to catch an invalid address sent from userspace.

Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1630
Cc: Francois Dugast
Cc: Maarten Lankhorst
Cc: Matthew Auld
Cc: Matthew Brost
Reviewed-by: Matthew Brost
Link: https://patchwork.freedesktop.org/patch/msgid/20241016082304.66009-2-nirmoy.das@intel.com
Signed-off-by: Nirmoy Das
(cherry picked from commit 9408c4508483ffc60811e910a93d6425b8e63928)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/xe_sync.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index c6cf227ead40c..2e72c06fd40d0 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -54,8 +54,9 @@ static struct xe_user_fence *user_fence_create(struct xe_device *xe, u64 addr,
 {
 	struct xe_user_fence *ufence;
 	u64 __user *ptr = u64_to_user_ptr(addr);
+	u64 __maybe_unused prefetch_val;
 
-	if (!access_ok(ptr, sizeof(*ptr)))
+	if (get_user(prefetch_val, ptr))
 		return ERR_PTR(-EFAULT);
 
 	ufence = kzalloc(sizeof(*ufence), GFP_KERNEL);

From cdc21021f0351226a4845715564afd5dc50ed44b Mon Sep 17 00:00:00 2001
From: Nirmoy Das
Date: Tue, 22 Oct 2024 12:35:55 +0200
Subject: [PATCH 5/5] drm/xe: Don't restart parallel queues multiple times on
 GT reset

In the case of parallel submissions, multiple GuC ids point to the same
exec queue, and on GT reset such exec queues would get restarted
multiple times, which is not desirable. Handle each exec queue exactly
once by skipping the aliased lookup entries, as sketched below.
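[ For illustration, the filtering applied by the fix, simplified from
  the loop in the diff below, with extra comments; a parallel queue of
  width N occupies N consecutive slots of the lookup xarray, all
  pointing at the same queue. ]

	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) {
		/*
		 * q->guc->id is the queue's first GuC id, so only the slot
		 * whose index equals it is the canonical entry; the other
		 * width - 1 slots are aliases of the same queue.
		 */
		if (q->guc->id != index)
			continue;

		guc_exec_queue_stop(guc, q);	/* each queue handled once */
	}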
v2: don't use exec_queue_enabled(), which could race; do the same for
    xe_guc_submit_stop() (Matt B)

Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2295
Cc: Jonathan Cavitt
Cc: Himal Prasad Ghimiray
Cc: Matthew Auld
Cc: Matthew Brost
Cc: Tejas Upadhyay
Reviewed-by: Matthew Brost
Link: https://patchwork.freedesktop.org/patch/msgid/20241022103555.731557-1-nirmoy.das@intel.com
Signed-off-by: Nirmoy Das
(cherry picked from commit c8b0acd6d8745fd7e6450f5acc38f0227bd253b3)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/xe_guc_submit.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 8a9254e5af6e6..d333be9c42277 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1726,8 +1726,13 @@ void xe_guc_submit_stop(struct xe_guc *guc)
 
 	mutex_lock(&guc->submission_state.lock);
 
-	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) {
+		/* Prevent redundant attempts to stop parallel queues */
+		if (q->guc->id != index)
+			continue;
+
 		guc_exec_queue_stop(guc, q);
+	}
 
 	mutex_unlock(&guc->submission_state.lock);
 
@@ -1765,8 +1770,13 @@ int xe_guc_submit_start(struct xe_guc *guc)
 	mutex_lock(&guc->submission_state.lock);
 	atomic_dec(&guc->submission_state.stopped);
-	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) {
+		/* Prevent redundant attempts to start parallel queues */
+		if (q->guc->id != index)
+			continue;
+
 		guc_exec_queue_start(q);
+	}
 
 	mutex_unlock(&guc->submission_state.lock);
 
 	wake_up_all(&guc->ct.wq);
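[ For context, a sketch of the registration side that creates the
  aliased entries filtered out above, simplified from the GuC id
  allocation path; declarations and error handling are elided. ]

	u32 i;

	/*
	 * A parallel queue reserves q->width consecutive GuC ids and
	 * stores the same exec queue pointer under each of them;
	 * q->guc->id keeps the base id, which is what the
	 * q->guc->id == index test above recovers.
	 */
	for (i = 0; i < q->width; ++i)
		xa_store(&guc->submission_state.exec_queue_lookup,
			 q->guc->id + i, q, GFP_NOWAIT);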