Skip to content

Commit

Permalink
drm/i915/guc: Plumb GuC-capture into gpu_coredump
Browse files Browse the repository at this point in the history
Add a flags parameter through all of the coredump creation
functions. Add a bitmask flag to indicate if the top
level gpu_coredump event is triggered in response to
a GuC context reset notification.

Using that flag, ensure all coredump functions that
read or print mmio-register values related to work submission
or command-streamer engines are skipped and replaced with
calls to the equivalent guc-capture module functions to retrieve
or print the register dump.

While here, split out display related register reading
and printing into its own function that is called agnostic
to whether GuC had triggered the reset.

For now, introduce an empty printing function that can
be filled in by a subsequent patch just to handle formatting.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220321164527.2500062-13-alan.previn.teres.alexis@intel.com
  • Loading branch information
Alan Previn authored and Lucas De Marchi committed Mar 22, 2022
1 parent 247f807 commit a6f0f9c
Show file tree
Hide file tree
Showing 8 changed files with 288 additions and 97 deletions.
4 changes: 2 additions & 2 deletions drivers/gpu/drm/i915/gt/intel_execlists_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -2229,11 +2229,11 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
if (!cap->error)
goto err_cap;

cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp, CORE_DUMP_FLAG_NONE);
if (!cap->error->gt)
goto err_gpu;

cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp, CORE_DUMP_FLAG_NONE);
if (!cap->error->gt->engine)
goto err_gt;

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/gt/intel_reset.c
Original file line number Diff line number Diff line change
Expand Up @@ -1318,7 +1318,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
engine_mask &= gt->info.engine_mask;

if (flags & I915_ERROR_CAPTURE) {
i915_capture_error_state(gt, engine_mask);
i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
intel_gt_clear_error_registers(gt, engine_mask);
}

Expand Down
70 changes: 70 additions & 0 deletions drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "gt/intel_engine_regs.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_regs.h"
#include "gt/intel_lrc.h"
#include "guc_capture_fwif.h"
#include "intel_guc_capture.h"
#include "intel_guc_fwif.h"
Expand Down Expand Up @@ -755,6 +756,18 @@ intel_guc_capture_output_min_size_est(struct intel_guc *guc)
* data from GuC and then it's added into guc->capture->outlist linked
* list. This list is used for matchup and printout by i915_gpu_coredump
* and err_print_gt, (when user invokes the error capture sysfs).
*
* GUC --> notify context reset:
* -----------------------------
* --> G2H CONTEXT RESET
* L--> guc_handle_context_reset --> i915_capture_error_state
* L--> i915_gpu_coredump(..IS_GUC_CAPTURE) --> gt_record_engines
* --> capture_engine(..IS_GUC_CAPTURE)
* L--> intel_guc_capture_get_matching_node is where we
* detach C from the internal linked list and add it into
* the intel_engine_coredump struct (if the context and
* engine of the event notification match a node
* in the linked list).
*/

static int guc_capture_buf_cnt(struct __guc_capture_bufstate *buf)
Expand Down Expand Up @@ -1370,6 +1383,63 @@ static void __guc_capture_process_output(struct intel_guc *guc)
__guc_capture_flushlog_complete(guc);
}

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

/*
 * intel_guc_capture_print_engine_node - print the GuC-captured register dump
 * of one engine into an error-state buffer.
 * @ebuf: destination error-state buffer (currently unused)
 * @ee: engine coredump the dump belongs to (currently unused)
 *
 * Placeholder only: nothing is written to @ebuf yet; the actual formatting
 * is expected to be filled in later.
 *
 * Return: always 0.
 */
int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *ebuf,
					const struct intel_engine_coredump *ee)
{
	/* Nothing to print for now. */
	return 0;
}

#endif //CONFIG_DRM_I915_CAPTURE_ERROR

/*
 * intel_guc_capture_free_node - release the GuC capture node attached to an
 * engine coredump.
 * @ee: engine coredump; may be NULL or have no node attached, in which case
 *      nothing happens.
 *
 * The node is passed to guc_capture_add_node_to_cachelist() together with the
 * capture state stored in @ee, and both references held by @ee are then
 * cleared so the node cannot be released twice through the same coredump.
 */
void intel_guc_capture_free_node(struct intel_engine_coredump *ee)
{
	if (ee && ee->guc_capture_node) {
		guc_capture_add_node_to_cachelist(ee->capture, ee->guc_capture_node);

		/* Drop our references now that the node has been handed back. */
		ee->guc_capture_node = NULL;
		ee->capture = NULL;
	}
}

/*
 * intel_guc_capture_get_matching_node - attach the GuC capture node that
 * matches a given engine/context pair to an engine coredump.
 * @gt: GT whose GuC capture output list is searched
 * @ee: engine coredump that receives the node; must not already hold one
 * @ce: context whose guc_id and lrca are compared against each node
 *
 * Walks guc->capture->outlist and picks the first node whose engine
 * class/instance, (non-zero) guc_id and (non-zero) masked lrca all match.
 * That node is unlinked from the list and stored in @ee along with a pointer
 * to the capture state. If no node matches, a debug message is logged and @ee
 * is left untouched.
 *
 * Returns silently when any argument is NULL or GuC capture is not set up.
 */
void intel_guc_capture_get_matching_node(struct intel_gt *gt,
					 struct intel_engine_coredump *ee,
					 struct intel_context *ce)
{
	struct __guc_capture_parsed_output *node, *next;
	struct intel_guc *guc;

	if (!gt || !ee || !ce)
		return;

	guc = &gt->uc.guc;
	if (!guc->capture)
		return;

	GEM_BUG_ON(ee->guc_capture_node);

	list_for_each_entry_safe(node, next, &guc->capture->outlist, link) {
		/* The node must come from the same engine instance and class. */
		if (node->eng_inst != GUC_ID_TO_ENGINE_INSTANCE(ee->engine->guc_id))
			continue;
		if (node->eng_class != GUC_ID_TO_ENGINE_CLASS(ee->engine->guc_id))
			continue;

		/* A zero guc_id never matches; otherwise it must equal the context's. */
		if (!node->guc_id || node->guc_id != ce->guc_id.id)
			continue;

		/* Likewise for the masked lrca: non-zero and equal to the context's. */
		if (!(node->lrca & CTX_GTT_ADDRESS_MASK))
			continue;
		if ((node->lrca & CTX_GTT_ADDRESS_MASK) !=
		    (ce->lrc.lrca & CTX_GTT_ADDRESS_MASK))
			continue;

		/* Match found: move the node from the output list into @ee. */
		list_del(&node->link);
		ee->guc_capture_node = node;
		ee->capture = guc->capture;
		return;
	}

	drm_dbg(&gt->i915->drm, "GuC capture can't match ee to node\n");
}

void intel_guc_capture_process(struct intel_guc *guc)
{
if (guc->capture)
Expand Down
9 changes: 9 additions & 0 deletions drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,18 @@

#include <linux/types.h>

struct drm_i915_error_state_buf;
struct guc_gt_system_info;
struct intel_context;
struct intel_engine_coredump;
struct intel_gt;
struct intel_guc;

void intel_guc_capture_free_node(struct intel_engine_coredump *ee);
int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *m,
const struct intel_engine_coredump *ee);
void intel_guc_capture_get_matching_node(struct intel_gt *gt, struct intel_engine_coredump *ee,
struct intel_context *ce);
void intel_guc_capture_process(struct intel_guc *guc);
int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -4031,7 +4031,7 @@ static void capture_error_state(struct intel_guc *guc,

intel_engine_set_hung_context(engine, ce);
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
i915_capture_error_state(gt, engine->mask);
i915_capture_error_state(gt, engine->mask, CORE_DUMP_FLAG_IS_GUC_CAPTURE);
atomic_inc(&i915->gpu_error.reset_engine_count[engine->uabi_class]);
}

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/i915_debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)

gpu = NULL;
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES);
gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES, CORE_DUMP_FLAG_NONE);
if (IS_ERR(gpu))
return PTR_ERR(gpu);

Expand Down
Loading

0 comments on commit a6f0f9c

Please sign in to comment.