Skip to content

Commit

Permalink
drm/msm: add basic hangcheck/recovery mechanism
Browse files Browse the repository at this point in the history
A basic, no-frills recovery mechanism in case the gpu gets wedged.  We
could try to be a bit more fancy and restart the next submit after the
one that got wedged, but for now keep it simple.  This is enough to
recover things if, for example, the gpu hangs mid way through a piglit
run.

Signed-off-by: Rob Clark <robdclark@gmail.com>
  • Loading branch information
Rob Clark committed Aug 24, 2013
1 parent 7198e6b commit bd6f82d
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 5 deletions.
1 change: 1 addition & 0 deletions drivers/gpu/drm/msm/adreno/a3xx_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ static const struct adreno_gpu_funcs funcs = {
.hw_init = a3xx_hw_init,
.pm_suspend = msm_gpu_pm_suspend,
.pm_resume = msm_gpu_pm_resume,
.recover = adreno_recover,
.last_fence = adreno_last_fence,
.submit = adreno_submit,
.flush = adreno_flush,
Expand Down
26 changes: 23 additions & 3 deletions drivers/gpu/drm/msm/adreno/adreno_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,28 @@ uint32_t adreno_last_fence(struct msm_gpu *gpu)
return adreno_gpu->memptrs->fence;
}

/*
 * Bring a wedged GPU back to a usable state.
 *
 * The GPU is suspended, the ringbuffer's current position is rewound to
 * the start of the ring, and the completed-fence value in the shared
 * memptrs is set to the last submitted fence so that anything still
 * pending is discarded.  The GPU is then resumed and its hardware is
 * re-initialized.
 *
 * Recovery is best-effort: this function returns void, so a failure of
 * the suspend/resume/hw_init hooks cannot be propagated.  Each failure is
 * logged and the sequence carries on regardless.
 */
void adreno_recover(struct msm_gpu *gpu)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct drm_device *dev = gpu->dev;
	int ret;

	ret = gpu->funcs->pm_suspend(gpu);
	if (ret)
		dev_err(dev->dev, "gpu pm suspend failed: %d\n", ret);

	/* reset ringbuffer: */
	gpu->rb->cur = gpu->rb->start;

	/* reset completed fence seqno, just discard anything pending: */
	adreno_gpu->memptrs->fence = gpu->submitted_fence;

	ret = gpu->funcs->pm_resume(gpu);
	if (ret)
		dev_err(dev->dev, "gpu pm resume failed: %d\n", ret);

	ret = gpu->funcs->hw_init(gpu);
	if (ret) {
		/* nothing more we can do from here; leave a trace and move on */
		dev_err(dev->dev, "gpu hw init failed: %d\n", ret);
	}
}

int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx)
{
Expand All @@ -119,8 +141,6 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_ringbuffer *ring = gpu->rb;
unsigned i, ibs = 0;

adreno_gpu->last_fence = submit->fence;

for (i = 0; i < submit->nr_cmds; i++) {
switch (submit->cmd[i].type) {
case MSM_SUBMIT_CMD_IB_TARGET_BUF:
Expand Down Expand Up @@ -225,7 +245,7 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m)
adreno_gpu->rev.patchid);

seq_printf(m, "fence: %d/%d\n", adreno_gpu->memptrs->fence,
adreno_gpu->last_fence);
gpu->submitted_fence);
seq_printf(m, "rptr: %d\n", adreno_gpu->memptrs->rptr);
seq_printf(m, "wptr: %d\n", adreno_gpu->memptrs->wptr);
seq_printf(m, "rb wptr: %d\n", get_wptr(gpu->rb));
Expand Down
3 changes: 1 addition & 2 deletions drivers/gpu/drm/msm/adreno/adreno_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ struct adreno_gpu {
uint32_t revn; /* numeric revision name */
const struct adreno_gpu_funcs *funcs;

uint32_t last_fence;

/* firmware: */
const struct firmware *pm4, *pfp;

Expand Down Expand Up @@ -99,6 +97,7 @@ static inline bool adreno_is_a330(struct adreno_gpu *gpu)
int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value);
int adreno_hw_init(struct msm_gpu *gpu);
uint32_t adreno_last_fence(struct msm_gpu *gpu);
void adreno_recover(struct msm_gpu *gpu);
int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx);
void adreno_flush(struct msm_gpu *gpu);
Expand Down
52 changes: 52 additions & 0 deletions drivers/gpu/drm/msm/msm_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,51 @@ int msm_gpu_pm_suspend(struct msm_gpu *gpu)
return 0;
}

/*
* Hangcheck detection for locked gpu:
*/

/*
 * Work item queued by the hangcheck timer when the GPU appears hung.
 *
 * Logs the event, invokes the GPU's ->recover() hook with the DRM
 * device's struct_mutex held, and finally calls msm_gpu_retire().
 */
static void recover_worker(struct work_struct *work)
{
	struct msm_gpu *gpu;
	struct drm_device *drm;

	gpu = container_of(work, struct msm_gpu, recover_work);
	drm = gpu->dev;

	dev_err(drm->dev, "%s: hangcheck recover!\n", gpu->name);

	/* the recover hook runs under struct_mutex */
	mutex_lock(&drm->struct_mutex);
	gpu->funcs->recover(gpu);
	mutex_unlock(&drm->struct_mutex);

	/* retire is invoked only after the lock has been dropped */
	msm_gpu_retire(gpu);
}

/*
 * (Re)arm the hangcheck timer so that it fires one hangcheck period from
 * now, with the expiry rounded up via round_jiffies_up().
 */
static void hangcheck_timer_reset(struct msm_gpu *gpu)
{
	unsigned long expires;

	DBG("%s", gpu->name);

	expires = round_jiffies_up(jiffies + DRM_MSM_HANGCHECK_JIFFIES);
	mod_timer(&gpu->hangcheck_timer, expires);
}

/*
 * Hangcheck timer callback.
 *
 * Compares the fence most recently completed by the GPU with the value
 * recorded on the previous tick:
 *  - if it changed, the GPU made progress and the new value is recorded;
 *  - if it did not change while submitted work is still outstanding, the
 *    GPU is considered hung and the recover worker is queued.
 * The timer is re-armed for as long as submitted work is outstanding.
 *
 * Fence seqnos are free-running 32-bit counters, so "is a ahead of b" is
 * evaluated on the signed difference (int32_t)(a - b) > 0 rather than
 * with a plain unsigned compare; this stays correct across wraparound.
 *
 * @data: the struct msm_gpu pointer registered with the timer, passed as
 *        an unsigned long.
 */
static void hangcheck_handler(unsigned long data)
{
	struct msm_gpu *gpu = (struct msm_gpu *)data;
	uint32_t fence = gpu->funcs->last_fence(gpu);

	if (fence != gpu->hangcheck_fence) {
		/* some progress has been made since the last tick */
		gpu->hangcheck_fence = fence;
	} else if ((int32_t)(gpu->submitted_fence - fence) > 0) {
		/*
		 * no progress and not done: hung.  hangcheck_fence already
		 * equals fence on this path, so there is nothing to record.
		 */
		struct msm_drm_private *priv = gpu->dev->dev_private;

		queue_work(priv->wq, &gpu->recover_work);
	}

	/* if still more pending work, reset the hangcheck timer: */
	if ((int32_t)(gpu->submitted_fence - gpu->hangcheck_fence) > 0)
		hangcheck_timer_reset(gpu);
}

/*
* Cmdstream submission/retirement:
*/
Expand Down Expand Up @@ -254,6 +299,8 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,

submit->fence = ++priv->next_fence;

gpu->submitted_fence = submit->fence;

ret = gpu->funcs->submit(gpu, submit, ctx);
priv->lastctx = ctx;

Expand All @@ -276,6 +323,7 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,

msm_gem_move_to_active(&msm_obj->base, gpu, submit->fence);
}
hangcheck_timer_reset(gpu);
mutex_unlock(&dev->struct_mutex);

return ret;
Expand Down Expand Up @@ -307,6 +355,10 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,

INIT_LIST_HEAD(&gpu->active_list);
INIT_WORK(&gpu->retire_work, retire_worker);
INIT_WORK(&gpu->recover_work, recover_worker);

setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
(unsigned long)gpu);

BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks));

Expand Down
10 changes: 10 additions & 0 deletions drivers/gpu/drm/msm/msm_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ struct msm_gpu_funcs {
void (*idle)(struct msm_gpu *gpu);
irqreturn_t (*irq)(struct msm_gpu *irq);
uint32_t (*last_fence)(struct msm_gpu *gpu);
void (*recover)(struct msm_gpu *gpu);
void (*destroy)(struct msm_gpu *gpu);
#ifdef CONFIG_DEBUG_FS
/* show GPU status in debugfs: */
Expand All @@ -69,6 +70,8 @@ struct msm_gpu {
/* list of GEM active objects: */
struct list_head active_list;

uint32_t submitted_fence;

/* worker for handling active-list retiring: */
struct work_struct retire_work;

Expand All @@ -83,6 +86,13 @@ struct msm_gpu {
struct clk *ebi1_clk, *grp_clks[5];
uint32_t fast_rate, slow_rate, bus_freq;
uint32_t bsc;

/* Hang Detection: */
#define DRM_MSM_HANGCHECK_PERIOD 500 /* in ms */
#define DRM_MSM_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_MSM_HANGCHECK_PERIOD)
struct timer_list hangcheck_timer;
uint32_t hangcheck_fence;
struct work_struct recover_work;
};

static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
Expand Down

0 comments on commit bd6f82d

Please sign in to comment.