Skip to content

Commit

Permalink
drm/xe/guc: Plumb GuC-capture into dev coredump
Browse files Browse the repository at this point in the history
When we decide to kill a job (from guc_exec_queue_timedout_job), we could
end up in one of 4 possible scenarios at the starting point of this decision:
1. the guc-captured register-dump is already there.
2. the driver is wedged.mode > 1, so GuC-engine-reset / GuC-err-capture
   will not happen.
3. the user has started the driver in execlist-submission mode.
4. the guc-captured register-dump is not ready yet so we force GuC to kill
   that context now, but:
     A. we don't know yet if GuC will be successful on the engine-reset
        and get the guc-err-capture, else kmd will do a manual reset later
     OR B. guc will be successful and we will get a guc-err-capture
           shortly.

So to accommodate scenarios 2 and 4A, we will need to do a manual KMD
capture first (which is not reliable in guc-submission mode) and decide
later if we need to use that for the cases of 2 or 4A. So this flow is
part of the implementation for this patch.

Provide xe_guc_capture_get_reg_desc_list to get the register descriptor
list.
Add a manual capture that reads from the hw engine if the GuC capture is not
ready. If it becomes ready at a later time, the GuC-sourced data will be used.

Although there may only be a small delay between (1) the check for whether
guc-err-capture is available at the start of guc_exec_queue_timedout_job
and (2) the decision on using a valid guc-err-capture or manual-capture,
let's not take any chances and lock the matching node down so it doesn't
get re-claimed if the GuC-Err-Capture subsystem is running out of pre-cached
nodes.

Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
Reviewed-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241004193428.3311145-6-zhanjun.dong@intel.com
  • Loading branch information
Zhanjun Dong authored and Matt Roper committed Oct 8, 2024
1 parent 8bfc496 commit ecb6336
Show file tree
Hide file tree
Showing 12 changed files with 508 additions and 103 deletions.
17 changes: 7 additions & 10 deletions drivers/gpu/drm/xe/xe_devcoredump.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "xe_force_wake.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_guc_capture.h"
#include "xe_guc_ct.h"
#include "xe_guc_log.h"
#include "xe_guc_submit.h"
Expand Down Expand Up @@ -134,6 +135,9 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
xe_guc_ct_snapshot_free(ss->guc.ct);
ss->guc.ct = NULL;

xe_guc_capture_put_matched_nodes(&ss->gt->uc.guc);
ss->matched_node = NULL;

xe_guc_exec_queue_snapshot_free(ss->ge);
ss->ge = NULL;

Expand Down Expand Up @@ -217,6 +221,7 @@ static void xe_devcoredump_free(void *data)
/* To prevent stale data on next snapshot, clear everything */
memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
coredump->captured = false;
coredump->job = NULL;
drm_info(&coredump_to_xe(coredump)->drm,
"Xe device coredump has been deleted.\n");
}
Expand All @@ -227,8 +232,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
struct xe_exec_queue *q = job->q;
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_hw_engine *hwe;
enum xe_hw_engine_id id;
u32 adj_logical_mask = q->logical_mask;
u32 width_mask = (0x1 << q->width) - 1;
const char *process_name = "no process";
Expand All @@ -244,6 +247,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
strscpy(ss->process_name, process_name);

ss->gt = q->gt;
coredump->job = job;
INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);

cookie = dma_fence_begin_signalling();
Expand All @@ -266,14 +270,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
ss->job = xe_sched_job_snapshot_capture(job);
ss->vm = xe_vm_snapshot_capture(q->vm);

for_each_hw_engine(hwe, q->gt, id) {
if (hwe->class != q->hwe->class ||
!(BIT(hwe->logical_instance) & adj_logical_mask)) {
ss->hwe[id] = NULL;
continue;
}
ss->hwe[id] = xe_hw_engine_snapshot_capture(hwe);
}
xe_engine_snapshot_capture_for_job(job);

queue_work(system_unbound_wq, &ss->work);

Expand Down
8 changes: 8 additions & 0 deletions drivers/gpu/drm/xe/xe_devcoredump_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ struct xe_devcoredump_snapshot {
struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
/** @job: Snapshot of job state */
struct xe_sched_job_snapshot *job;
/**
* @matched_node: The matched capture node for timedout job
* this single-node tracker works because devcoredump will always only
* produce one hw-engine capture per devcoredump event
*/
struct __guc_capture_parsed_output *matched_node;
/** @vm: Snapshot of VM state */
struct xe_vm_snapshot *vm;

Expand All @@ -74,6 +80,8 @@ struct xe_devcoredump {
bool captured;
/** @snapshot: Snapshot is captured at time of the first crash */
struct xe_devcoredump_snapshot snapshot;
/** @job: Point to the faulting job */
struct xe_sched_job *job;
};

#endif
13 changes: 13 additions & 0 deletions drivers/gpu/drm/xe/xe_gt_mcr.c
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,19 @@ void xe_gt_mcr_get_dss_steering(struct xe_gt *gt, unsigned int dss, u16 *group,
*instance = dss % gt->steering_dss_per_grp;
}

/**
 * xe_gt_mcr_steering_info_to_dss_id - Get DSS ID from group/instance steering
 * @gt: GT structure
 * @group: steering group ID
 * @instance: steering instance ID
 *
 * The DSS ID is computed as @group multiplied by the per-group DSS count
 * reported by dss_per_group() for @gt, plus @instance.
 *
 * Return: the converted DSS id.
 */
u32 xe_gt_mcr_steering_info_to_dss_id(struct xe_gt *gt, u16 group, u16 instance)
{
	/* Groups are laid out back to back; instance is the offset in the group. */
	const u32 dss_id = group * dss_per_group(gt) + instance;

	return dss_id;
}

static void init_steering_dss(struct xe_gt *gt)
{
gt->steering_dss_per_grp = dss_per_group(gt);
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/xe/xe_gt_mcr.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ void xe_gt_mcr_multicast_write(struct xe_gt *gt, struct xe_reg_mcr mcr_reg,

void xe_gt_mcr_steering_dump(struct xe_gt *gt, struct drm_printer *p);
void xe_gt_mcr_get_dss_steering(struct xe_gt *gt, unsigned int dss, u16 *group, u16 *instance);
u32 xe_gt_mcr_steering_info_to_dss_id(struct xe_gt *gt, u16 group, u16 instance);

/*
* Loop over each DSS and determine the group and instance IDs that
Expand Down
Loading

0 comments on commit ecb6336

Please sign in to comment.