Skip to content

Commit

Permalink
drm/i915/huc: Allow for very slow HuC loading
Browse files Browse the repository at this point in the history
A failure to load the HuC is occasionally observed where the cause is
believed to be a low GT frequency leading to very long load times.

So a) increase the timeout so that the user still gets a working
system even in the case of slow load. And b) report the frequency
during the load to see if that is the cause of the slow down.

Also update the similar code on the GuC load to not use uncore->gt
when there is a local gt available. The two should match, but no need
for unnecessary de-referencing.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240102222202.310495-1-John.C.Harrison@Intel.com
  • Loading branch information
John Harrison authored and John Harrison committed Jan 6, 2024
1 parent c5b32a4 commit a797099
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 11 deletions.
10 changes: 5 additions & 5 deletions drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,15 @@ static int guc_wait_ucode(struct intel_guc *guc)
* in the seconds range. However, there is a limit on how long an
* individual wait_for() can wait. So wrap it in a loop.
*/
before_freq = intel_rps_read_actual_frequency(&uncore->gt->rps);
before_freq = intel_rps_read_actual_frequency(&gt->rps);
before = ktime_get();
for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
ret = wait_for(guc_load_done(uncore, &status, &success), 1000);
if (!ret || !success)
break;

guc_dbg(guc, "load still in progress, count = %d, freq = %dMHz, status = 0x%08X [0x%02X/%02X]\n",
count, intel_rps_read_actual_frequency(&uncore->gt->rps), status,
count, intel_rps_read_actual_frequency(&gt->rps), status,
REG_FIELD_GET(GS_BOOTROM_MASK, status),
REG_FIELD_GET(GS_UKERNEL_MASK, status));
}
Expand All @@ -204,7 +204,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);

guc_info(guc, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz, ret = %d\n",
status, delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps), ret);
status, delta_ms, intel_rps_read_actual_frequency(&gt->rps), ret);
guc_info(guc, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
REG_FIELD_GET(GS_MIA_IN_RESET, status),
bootrom, ukernel,
Expand Down Expand Up @@ -254,11 +254,11 @@ static int guc_wait_ucode(struct intel_guc *guc)
guc_warn(guc, "excessive init time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
delta_ms, status, count, ret);
guc_warn(guc, "excessive init time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
intel_rps_read_actual_frequency(&uncore->gt->rps), before_freq,
intel_rps_read_actual_frequency(&gt->rps), before_freq,
intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
} else {
guc_dbg(guc, "init took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps),
delta_ms, intel_rps_read_actual_frequency(&gt->rps),
before_freq, status, count, ret);
}

Expand Down
64 changes: 58 additions & 6 deletions drivers/gpu/drm/i915/gt/uc/intel_huc.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <linux/types.h>

#include "gt/intel_gt.h"
#include "gt/intel_rps.h"
#include "intel_guc_reg.h"
#include "intel_huc.h"
#include "intel_huc_print.h"
Expand Down Expand Up @@ -447,17 +448,68 @@ static const char *auth_mode_string(struct intel_huc *huc,
return partial ? "clear media" : "all workloads";
}

/*
* Use a longer timeout for debug builds so that problems can be detected
* and analysed. But a shorter timeout for releases so that user's don't
* wait forever to find out there is a problem. Note that the only reason
* an end user should hit the timeout is in case of extreme thermal throttling.
* And a system that is that hot during boot is probably dead anyway!
*/
#if defined(CONFIG_DRM_I915_DEBUG_GEM)
#define HUC_LOAD_RETRY_LIMIT 20
#else
#define HUC_LOAD_RETRY_LIMIT 3
#endif

int intel_huc_wait_for_auth_complete(struct intel_huc *huc,
enum intel_huc_authentication_type type)
{
struct intel_gt *gt = huc_to_gt(huc);
int ret;
struct intel_uncore *uncore = gt->uncore;
ktime_t before, after, delta;
int ret, count;
u64 delta_ms;
u32 before_freq;

ret = __intel_wait_for_register(gt->uncore,
huc->status[type].reg,
huc->status[type].mask,
huc->status[type].value,
2, 50, NULL);
/*
* The KMD requests maximum frequency during driver load, however thermal
* throttling can force the frequency down to minimum (although the board
* really should never get that hot in real life!). IFWI issues have been
* seen to cause sporadic failures to grant the higher frequency. And at
* minimum frequency, the authentication time can be in the seconds range.
* Note that there is a limit on how long an individual wait_for() can wait.
* So wrap it in a loop.
*/
before_freq = intel_rps_read_actual_frequency(&gt->rps);
before = ktime_get();
for (count = 0; count < HUC_LOAD_RETRY_LIMIT; count++) {
ret = __intel_wait_for_register(gt->uncore,
huc->status[type].reg,
huc->status[type].mask,
huc->status[type].value,
2, 1000, NULL);
if (!ret)
break;

huc_dbg(huc, "auth still in progress, count = %d, freq = %dMHz, status = 0x%08X\n",
count, intel_rps_read_actual_frequency(&gt->rps),
huc->status[type].reg.reg);
}
after = ktime_get();
delta = ktime_sub(after, before);
delta_ms = ktime_to_ms(delta);

if (delta_ms > 50) {
huc_warn(huc, "excessive auth time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
delta_ms, huc->status[type].reg.reg, count, ret);
huc_warn(huc, "excessive auth time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
intel_rps_read_actual_frequency(&gt->rps), before_freq,
intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
} else {
huc_dbg(huc, "auth took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
delta_ms, intel_rps_read_actual_frequency(&gt->rps),
before_freq, huc->status[type].reg.reg, count, ret);
}

/* mark the load process as complete even if the wait failed */
delayed_huc_load_complete(huc);
Expand Down

0 comments on commit a797099

Please sign in to comment.