Skip to content

Commit

Permalink
drm/i915: Improve record of hung engines in error state
Browse files Browse the repository at this point in the history
Between the events that trigger engine and GPU resets and the capture of the
error state, we lose track of which engine triggered the reset. Improve
this by passing the hung engine mask down to error capture.

The result is that the list of engines in the user-visible "GPU HANG: ecode
<gen>:<engines>:<ecode>, <process>" message is now the list of hung engines,
not just the active ones. Most importantly, the displayed process is now the
one which was actually hung.

v2:
 * Stub prototype. (Chris)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20201104134743.916027-1-tvrtko.ursulin@linux.intel.com
  • Loading branch information
Tvrtko Ursulin committed Nov 9, 2020
1 parent ad18fa0 commit bda3002
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 17 deletions.
2 changes: 2 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_lrc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3037,6 +3037,8 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
if (!cap->error->gt->engine)
goto err_gt;

cap->error->gt->engine->hung = true;

return cap;

err_gt:
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/gt/intel_reset.c
Original file line number Diff line number Diff line change
Expand Up @@ -1251,7 +1251,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
engine_mask &= gt->info.engine_mask;

if (flags & I915_ERROR_CAPTURE) {
i915_capture_error_state(gt->i915);
i915_capture_error_state(gt, engine_mask);
intel_gt_clear_error_registers(gt, engine_mask);
}

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/i915_debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)

gpu = NULL;
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
gpu = i915_gpu_coredump(i915);
gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES);
if (IS_ERR(gpu))
return PTR_ERR(gpu);

Expand Down
35 changes: 23 additions & 12 deletions drivers/gpu/drm/i915/i915_gpu_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
ee->vm_info.pp_dir_base);
}
}
err_printf(m, " hung: %u\n", ee->hung);
err_printf(m, " engine reset count: %u\n", ee->reset_count);

for (n = 0; n < ee->num_ports; n++) {
Expand Down Expand Up @@ -1456,6 +1457,7 @@ capture_engine(struct intel_engine_cs *engine,

static void
gt_record_engines(struct intel_gt_coredump *gt,
intel_engine_mask_t engine_mask,
struct i915_vma_compress *compress)
{
struct intel_engine_cs *engine;
Expand All @@ -1471,6 +1473,8 @@ gt_record_engines(struct intel_gt_coredump *gt,
if (!ee)
continue;

ee->hung = engine->mask & engine_mask;

gt->simulated |= ee->simulated;
if (ee->simulated) {
kfree(ee);
Expand Down Expand Up @@ -1663,11 +1667,13 @@ static const char *error_msg(struct i915_gpu_coredump *error)
for (gt = error->gt; gt; gt = gt->next) {
struct intel_engine_coredump *cs;

if (gt->engine && !first)
first = gt->engine;

for (cs = gt->engine; cs; cs = cs->next)
engines |= cs->engine->mask;
for (cs = gt->engine; cs; cs = cs->next) {
if (cs->hung) {
engines |= cs->engine->mask;
if (!first)
first = cs;
}
}
}

len = scnprintf(error->error_msg, sizeof(error->error_msg),
Expand Down Expand Up @@ -1781,8 +1787,10 @@ void i915_vma_capture_finish(struct intel_gt_coredump *gt,
kfree(compress);
}

struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
struct i915_gpu_coredump *
i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
struct drm_i915_private *i915 = gt->i915;
struct i915_gpu_coredump *error;

/* Check if GPU capture has been disabled */
Expand All @@ -1794,7 +1802,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
if (!error)
return ERR_PTR(-ENOMEM);

error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL);
error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL);
if (error->gt) {
struct i915_vma_compress *compress;

Expand All @@ -1806,7 +1814,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
}

gt_record_info(error->gt);
gt_record_engines(error->gt, compress);
gt_record_engines(error->gt, engine_mask, compress);

if (INTEL_INFO(i915)->has_gt_uc)
error->gt->uc = gt_record_uc(error->gt, compress);
Expand Down Expand Up @@ -1853,20 +1861,23 @@ void i915_error_state_store(struct i915_gpu_coredump *error)

/**
* i915_capture_error_state - capture an error record for later analysis
* @i915: i915 device
* @gt: intel_gt which originated the hang
* @engine_mask: hung engines
*
*
* Should be called when an error is detected (either a hang or an error
* interrupt) to capture error state from the time of the error. Fills
* out a structure which becomes available in debugfs for user level tools
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *i915)
void i915_capture_error_state(struct intel_gt *gt,
intel_engine_mask_t engine_mask)
{
struct i915_gpu_coredump *error;

error = i915_gpu_coredump(i915);
error = i915_gpu_coredump(gt, engine_mask);
if (IS_ERR(error)) {
cmpxchg(&i915->gpu_error.first_error, NULL, error);
cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
return;
}

Expand Down
10 changes: 7 additions & 3 deletions drivers/gpu/drm/i915/i915_gpu_error.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ struct i915_request_coredump {
struct intel_engine_coredump {
const struct intel_engine_cs *engine;

bool hung;
bool simulated;
u32 reset_count;

Expand Down Expand Up @@ -218,8 +219,10 @@ struct drm_i915_error_state_buf {
__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);

struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *i915);
struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
intel_engine_mask_t engine_mask);
void i915_capture_error_state(struct intel_gt *gt,
intel_engine_mask_t engine_mask);

struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
Expand Down Expand Up @@ -271,7 +274,8 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err);

#else

static inline void i915_capture_error_state(struct drm_i915_private *i915)
static inline void
i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
}

Expand Down

0 comments on commit bda3002

Please sign in to comment.