Skip to content

Commit

Permalink
drm/i915/guc: Dump error capture to dmesg on CTB error
Browse files Browse the repository at this point in the history
In the past, there have been sporadic CTB failures which proved hard
to reproduce manually. The most effective solution was to dump the GuC
log at the point of failure and let the CI system do the repro. It is
preferable not to dump the GuC log via dmesg for all issues as it is
not always necessary and is not helpful for end users. But rather than
trying to re-invent the code to do this each time it is wanted, commit
the code but for DEBUG_GUC builds only.

v2: Use IS_ENABLED for testing config options.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230418181744.3251240-3-John.C.Harrison@Intel.com
  • Loading branch information
John Harrison authored and John Harrison committed May 16, 2023
1 parent 6197cff commit f6eeea8
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 0 deletions.
53 changes: 53 additions & 0 deletions drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,30 @@
#include "intel_guc_ct.h"
#include "intel_guc_print.h"

#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
/*
 * Bit positions used in intel_guc_ct.dead_ct_reason. Each CT_DEAD() call
 * site ORs in (1 << CT_DEAD_<reason>), so several reasons can accumulate
 * before the worker reports them. CT_DEAD_ALIVE (0) is the value the
 * reason field is reset to when the CTB is (re-)enabled.
 */
enum {
	CT_DEAD_ALIVE = 0,
	CT_DEAD_SETUP,
	CT_DEAD_WRITE,
	CT_DEAD_DEADLOCK,
	CT_DEAD_H2G_HAS_ROOM,
	CT_DEAD_READ,
	CT_DEAD_PROCESS_FAILED,
};

static void ct_dead_ct_worker_func(struct work_struct *w);

/*
 * CT_DEAD - record why the CTB is considered dead and queue the report worker.
 * @ct: pointer to the struct intel_guc_ct concerned (evaluated exactly once)
 * @reason: suffix of one of the CT_DEAD_* enumerators, e.g. SETUP or WRITE
 *
 * Does nothing once a report has already been made (dead_ct_reported set).
 * In non-DEBUG_GUC builds the macro compiles to nothing and @ct is not
 * evaluated at all, so callers must not rely on side effects of the argument.
 */
#define CT_DEAD(ct, reason) \
	do { \
		struct intel_guc_ct *ct_dead_ = (ct); \
		if (!ct_dead_->dead_ct_reported) { \
			ct_dead_->dead_ct_reason |= 1 << CT_DEAD_##reason; \
			queue_work(system_unbound_wq, &ct_dead_->dead_ct_worker); \
		} \
	} while (0)
#else
#define CT_DEAD(ct, reason)	do { } while (0)
#endif

static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct)
{
return container_of(ct, struct intel_guc, ct);
Expand Down Expand Up @@ -93,6 +117,9 @@ void intel_guc_ct_init_early(struct intel_guc_ct *ct)
spin_lock_init(&ct->requests.lock);
INIT_LIST_HEAD(&ct->requests.pending);
INIT_LIST_HEAD(&ct->requests.incoming);
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
INIT_WORK(&ct->dead_ct_worker, ct_dead_ct_worker_func);
#endif
INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func);
tasklet_setup(&ct->receive_tasklet, ct_receive_tasklet_func);
init_waitqueue_head(&ct->wq);
Expand Down Expand Up @@ -319,11 +346,16 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)

ct->enabled = true;
ct->stall_time = KTIME_MAX;
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
ct->dead_ct_reported = false;
ct->dead_ct_reason = CT_DEAD_ALIVE;
#endif

return 0;

err_out:
CT_PROBE_ERROR(ct, "Failed to enable CTB (%pe)\n", ERR_PTR(err));
CT_DEAD(ct, SETUP);
return err;
}

Expand Down Expand Up @@ -434,6 +466,7 @@ static int ct_write(struct intel_guc_ct *ct,
corrupted:
CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
desc->head, desc->tail, desc->status);
CT_DEAD(ct, WRITE);
ctb->broken = true;
return -EPIPE;
}
Expand Down Expand Up @@ -504,6 +537,7 @@ static inline bool ct_deadlocked(struct intel_guc_ct *ct)
CT_ERROR(ct, "Head: %u\n (Dwords)", ct->ctbs.recv.desc->head);
CT_ERROR(ct, "Tail: %u\n (Dwords)", ct->ctbs.recv.desc->tail);

CT_DEAD(ct, DEADLOCK);
ct->ctbs.send.broken = true;
}

Expand Down Expand Up @@ -552,6 +586,7 @@ static inline bool h2g_has_room(struct intel_guc_ct *ct, u32 len_dw)
head, ctb->size);
desc->status |= GUC_CTB_STATUS_OVERFLOW;
ctb->broken = true;
CT_DEAD(ct, H2G_HAS_ROOM);
return false;
}

Expand Down Expand Up @@ -914,6 +949,7 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
desc->head, desc->tail, desc->status);
ctb->broken = true;
CT_DEAD(ct, READ);
return -EPIPE;
}

Expand Down Expand Up @@ -1063,6 +1099,7 @@ static bool ct_process_incoming_requests(struct intel_guc_ct *ct)
if (unlikely(err)) {
CT_ERROR(ct, "Failed to process CT message (%pe) %*ph\n",
ERR_PTR(err), 4 * request->size, request->msg);
CT_DEAD(ct, PROCESS_FAILED);
ct_free_msg(request);
}

Expand Down Expand Up @@ -1239,3 +1276,19 @@ void intel_guc_ct_print_info(struct intel_guc_ct *ct,
drm_printf(p, "Tail: %u\n",
ct->ctbs.recv.desc->tail);
}

#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
/*
 * Work item queued by CT_DEAD(): logs the accumulated dead_ct_reason mask
 * and calls intel_klog_error_capture() for all engines. The
 * dead_ct_reported flag is set before dumping, so a second run of this
 * worker is a no-op while the flag remains set.
 */
static void ct_dead_ct_worker_func(struct work_struct *w)
{
	struct intel_guc_ct *dead = container_of(w, struct intel_guc_ct, dead_ct_worker);
	struct intel_guc *guc;

	if (!dead->dead_ct_reported) {
		guc = ct_to_guc(dead);
		dead->dead_ct_reported = true;

		guc_info(guc, "CTB is dead - reason=0x%X\n", dead->dead_ct_reason);
		intel_klog_error_capture(guc_to_gt(guc), (intel_engine_mask_t)~0U);
	}
}
#endif
6 changes: 6 additions & 0 deletions drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ struct intel_guc_ct {

/** @stall_time: time of first time a CTB submission is stalled */
ktime_t stall_time;

#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
int dead_ct_reason;
bool dead_ct_reported;
struct work_struct dead_ct_worker;
#endif
};

void intel_guc_ct_init_early(struct intel_guc_ct *ct);
Expand Down

0 comments on commit f6eeea8

Please sign in to comment.