diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index f3eb68a76a258..8a51c1c3a0911 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -3037,6 +3037,8 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) if (!cap->error->gt->engine) goto err_gt; + cap->error->gt->engine->hung = true; + return cap; err_gt: diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 4e5e13dc95da2..9fb4306b2900e 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1251,7 +1251,7 @@ void intel_gt_handle_error(struct intel_gt *gt, engine_mask &= gt->info.engine_mask; if (flags & I915_ERROR_CAPTURE) { - i915_capture_error_state(gt->i915); + i915_capture_error_state(gt, engine_mask); intel_gt_clear_error_registers(gt, engine_mask); } diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index ea469168cd443..a727552d2bc6a 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -725,7 +725,7 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file) gpu = NULL; with_intel_runtime_pm(&i915->runtime_pm, wakeref) - gpu = i915_gpu_coredump(i915); + gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES); if (IS_ERR(gpu)) return PTR_ERR(gpu); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index ae370909e8ddd..994738d974cce 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -570,6 +570,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, ee->vm_info.pp_dir_base); } } + err_printf(m, " hung: %u\n", ee->hung); err_printf(m, " engine reset count: %u\n", ee->reset_count); for (n = 0; n < ee->num_ports; n++) { @@ -1456,6 +1457,7 @@ capture_engine(struct intel_engine_cs *engine, static void gt_record_engines(struct intel_gt_coredump *gt, + intel_engine_mask_t engine_mask, struct i915_vma_compress *compress) { struct intel_engine_cs *engine; @@ -1471,6 +1473,8 @@ gt_record_engines(struct intel_gt_coredump *gt, if (!ee) continue; + ee->hung = engine->mask & engine_mask; + gt->simulated |= ee->simulated; if (ee->simulated) { kfree(ee); @@ -1663,11 +1667,13 @@ static const char *error_msg(struct i915_gpu_coredump *error) for (gt = error->gt; gt; gt = gt->next) { struct intel_engine_coredump *cs; - if (gt->engine && !first) - first = gt->engine; - - for (cs = gt->engine; cs; cs = cs->next) - engines |= cs->engine->mask; + for (cs = gt->engine; cs; cs = cs->next) { + if (cs->hung) { + engines |= cs->engine->mask; + if (!first) + first = cs; + } + } } len = scnprintf(error->error_msg, sizeof(error->error_msg), @@ -1781,8 +1787,10 @@ void i915_vma_capture_finish(struct intel_gt_coredump *gt, kfree(compress); } -struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915) +struct i915_gpu_coredump * +i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask) { + struct drm_i915_private *i915 = gt->i915; struct i915_gpu_coredump *error; /* Check if GPU capture has been disabled */ @@ -1794,7 +1802,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915) if (!error) return ERR_PTR(-ENOMEM); - error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL); + error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL); if (error->gt) { struct i915_vma_compress *compress; @@ -1806,7 +1814,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915) } gt_record_info(error->gt); - gt_record_engines(error->gt, compress); + gt_record_engines(error->gt, engine_mask, compress); if (INTEL_INFO(i915)->has_gt_uc) error->gt->uc = gt_record_uc(error->gt, compress); @@ -1853,20 +1861,23 @@ void i915_error_state_store(struct i915_gpu_coredump *error) /** * i915_capture_error_state - capture an error record for later analysis - * @i915: i915 device + * @gt: intel_gt which originated the hang + * @engine_mask: hung engines + * * * Should be called when an error is detected (either a hang or an error * interrupt) to capture error state from the time of the error. Fills * out a structure which becomes available in debugfs for user level tools * to pick up. */ -void i915_capture_error_state(struct drm_i915_private *i915) +void i915_capture_error_state(struct intel_gt *gt, + intel_engine_mask_t engine_mask) { struct i915_gpu_coredump *error; - error = i915_gpu_coredump(i915); + error = i915_gpu_coredump(gt, engine_mask); if (IS_ERR(error)) { - cmpxchg(&i915->gpu_error.first_error, NULL, error); + cmpxchg(>->i915->gpu_error.first_error, NULL, error); return; } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 0220b09928080..16bc42de4b840 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -59,6 +59,7 @@ struct i915_request_coredump { struct intel_engine_coredump { const struct intel_engine_cs *engine; + bool hung; bool simulated; u32 reset_count; @@ -218,8 +219,10 @@ struct drm_i915_error_state_buf { __printf(2, 3) void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); -struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915); -void i915_capture_error_state(struct drm_i915_private *i915); +struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt, + intel_engine_mask_t engine_mask); +void i915_capture_error_state(struct intel_gt *gt, + intel_engine_mask_t engine_mask); struct i915_gpu_coredump * i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp); @@ -271,7 +274,8 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err); #else -static inline void i915_capture_error_state(struct drm_i915_private *i915) +static inline void +i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask) { }