Skip to content

Commit

Permalink
drm/xe/guc_pc: Retry and wait longer for GuC PC start
Browse files Browse the repository at this point in the history
In a rare situation of thermal limit during resume, GuC can
be slow and run into delays like this:

xe 0000:00:02.0: [drm] GT1: excessive init time: 667ms! \
   		 [status = 0x8002F034, timeouts = 0]
xe 0000:00:02.0: [drm] GT1: excessive init time: \
   		 [freq = 100MHz (req = 800MHz), before = 100MHz, \
   		 perf_limit_reasons = 0x1C001000]
xe 0000:00:02.0: [drm] *ERROR* GT1: GuC PC Start failed
------------[ cut here ]------------
xe 0000:00:02.0: [drm] GT1: Failed to start GuC PC: -EIO

When this happens, it will block entirely the GPU to be used.
So, let's try and with a huge timeout in the hope it comes back.

Also, let's collect some information on how long it is usually
taking on situations like this, so perhaps the time can be tuned
later.

Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Cc: Jonathan Cavitt <jonathan.cavitt@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250307160307.1093391-1-rodrigo.vivi@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
  • Loading branch information
Rodrigo Vivi committed Mar 10, 2025
1 parent c36e344 commit b4b05e5
Showing 1 changed file with 40 additions and 13 deletions.
53 changes: 40 additions & 13 deletions drivers/gpu/drm/xe/xe_guc_pc.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "xe_guc_pc.h"

#include <linux/delay.h>
#include <linux/ktime.h>

#include <drm/drm_managed.h>
#include <drm/drm_print.h>
Expand All @@ -20,6 +21,7 @@
#include "xe_gt.h"
#include "xe_gt_idle.h"
#include "xe_gt_printk.h"
#include "xe_gt_throttle.h"
#include "xe_gt_types.h"
#include "xe_guc.h"
#include "xe_guc_ct.h"
Expand Down Expand Up @@ -50,6 +52,9 @@
#define LNL_MERT_FREQ_CAP 800
#define BMG_MERT_FREQ_CAP 2133

#define SLPC_RESET_TIMEOUT_MS 5 /* roughly 5ms, but no need for precision */
#define SLPC_RESET_EXTENDED_TIMEOUT_MS 1000 /* To be used only at pc_start */

/**
* DOC: GuC Power Conservation (PC)
*
Expand Down Expand Up @@ -114,9 +119,10 @@ static struct iosys_map *pc_to_maps(struct xe_guc_pc *pc)
FIELD_PREP(HOST2GUC_PC_SLPC_REQUEST_MSG_1_EVENT_ARGC, count))

static int wait_for_pc_state(struct xe_guc_pc *pc,
enum slpc_global_state state)
enum slpc_global_state state,
int timeout_ms)
{
int timeout_us = 5000; /* rought 5ms, but no need for precision */
int timeout_us = 1000 * timeout_ms;
int slept, wait = 10;

xe_device_assert_mem_access(pc_to_xe(pc));
Expand Down Expand Up @@ -165,7 +171,8 @@ static int pc_action_query_task_state(struct xe_guc_pc *pc)
};
int ret;

if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING))
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
SLPC_RESET_TIMEOUT_MS))
return -EAGAIN;

/* Blocking here to ensure the results are ready before reading them */
Expand All @@ -188,7 +195,8 @@ static int pc_action_set_param(struct xe_guc_pc *pc, u8 id, u32 value)
};
int ret;

if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING))
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
SLPC_RESET_TIMEOUT_MS))
return -EAGAIN;

ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
Expand All @@ -209,7 +217,8 @@ static int pc_action_unset_param(struct xe_guc_pc *pc, u8 id)
struct xe_guc_ct *ct = &pc_to_guc(pc)->ct;
int ret;

if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING))
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
SLPC_RESET_TIMEOUT_MS))
return -EAGAIN;

ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
Expand Down Expand Up @@ -443,6 +452,15 @@ u32 xe_guc_pc_get_act_freq(struct xe_guc_pc *pc)
return freq;
}

static u32 get_cur_freq(struct xe_gt *gt)
{
u32 freq;

freq = xe_mmio_read32(&gt->mmio, RPNSWREQ);
freq = REG_FIELD_GET(REQ_RATIO_MASK, freq);
return decode_freq(freq);
}

/**
* xe_guc_pc_get_cur_freq - Get Current requested frequency
* @pc: The GuC PC
Expand All @@ -466,10 +484,7 @@ int xe_guc_pc_get_cur_freq(struct xe_guc_pc *pc, u32 *freq)
return -ETIMEDOUT;
}

*freq = xe_mmio_read32(&gt->mmio, RPNSWREQ);

*freq = REG_FIELD_GET(REQ_RATIO_MASK, *freq);
*freq = decode_freq(*freq);
*freq = get_cur_freq(gt);

xe_force_wake_put(gt_to_fw(gt), fw_ref);
return 0;
Expand Down Expand Up @@ -1016,6 +1031,7 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
struct xe_gt *gt = pc_to_gt(pc);
u32 size = PAGE_ALIGN(sizeof(struct slpc_shared_data));
unsigned int fw_ref;
ktime_t earlier;
int ret;

xe_gt_assert(gt, xe_device_uc_enabled(xe));
Expand All @@ -1040,14 +1056,25 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
memset(pc->bo->vmap.vaddr, 0, size);
slpc_shared_data_write(pc, header.size, size);

earlier = ktime_get();
ret = pc_action_reset(pc);
if (ret)
goto out;

if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) {
xe_gt_err(gt, "GuC PC Start failed\n");
ret = -EIO;
goto out;
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
SLPC_RESET_TIMEOUT_MS)) {
xe_gt_warn(gt, "GuC PC start taking longer than normal [freq = %dMHz (req = %dMHz), perf_limit_reasons = 0x%08X]\n",
xe_guc_pc_get_act_freq(pc), get_cur_freq(gt),
xe_gt_throttle_get_limit_reasons(gt));

if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
SLPC_RESET_EXTENDED_TIMEOUT_MS)) {
xe_gt_err(gt, "GuC PC Start failed: Dynamic GT frequency control and GT sleep states are now disabled.\n");
goto out;
}

xe_gt_warn(gt, "GuC PC excessive start time: %lldms",
ktime_ms_delta(ktime_get(), earlier));
}

ret = pc_init_freqs(pc);
Expand Down

0 comments on commit b4b05e5

Please sign in to comment.