Skip to content

Commit

Permalink
drm/i915: Don't claim an unstarted request was guilty
Browse files Browse the repository at this point in the history
If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.

v2: Check for context corruption before trying to restart.
v3: Preserve semaphores on skipping requests (need to keep the timelines
intact).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-7-chris@chris-wilson.co.uk
  • Loading branch information
Chris Wilson committed Feb 8, 2019
1 parent 72eb16d commit 21182b3
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 3 deletions.
42 changes: 39 additions & 3 deletions drivers/gpu/drm/i915/intel_lrc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
*cs++ = rq->fence.seqno - 1;

intel_ring_advance(rq, cs);

/* Record the updated position of the request's payload */
rq->infix = intel_ring_offset(rq, cs);

return 0;
}

Expand Down Expand Up @@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

/*
 * lrc_regs_ok - cheap sanity check of a request's saved context registers
 * @rq: request whose hardware context register state is inspected
 *
 * Looks at two entries of the context's lrc_reg_state for the common signs
 * of context corruption: the ring buffer control entry must equal the ring
 * size encoding with RING_VALID set, and the ring buffer start entry must
 * equal the GGTT offset of the ring's vma.
 *
 * NOTE(review): the "+ 1" index presumably selects the value slot that
 * follows each register's offset slot in the state image - confirm against
 * the context image layout.
 *
 * Returns true if both entries hold the expected values, false otherwise.
 */
static bool lrc_regs_ok(const struct i915_request *rq)
{
	const u32 *state = rq->hw_context->lrc_reg_state;
	const struct intel_ring *rb = rq->ring;

	/*
	 * Short-circuit evaluation keeps the original ordering: the start
	 * address is only compared once the control word has checked out.
	 */
	return state[CTX_RING_BUFFER_CONTROL + 1] ==
			(RING_CTL_SIZE(rb->size) | RING_VALID) &&
	       state[CTX_RING_BUFFER_START + 1] == i915_ggtt_offset(rb->vma);
}

static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
Expand Down Expand Up @@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
if (!rq)
goto out_unlock;

/*
* If this request hasn't started yet, e.g. it is waiting on a
* semaphore, we need to avoid skipping the request or else we
* break the signaling chain. However, if the context is corrupt
* the request will not restart and we will be stuck with a wedged
* device. It is quite often the case that, if we issue a reset
* while the GPU is loading the context image, the context image
* becomes corrupt.
*
* Otherwise, if we have not started yet, the request should replay
* perfectly and we do not need to flag the result as being erroneous.
*/
if (!i915_request_started(rq) && lrc_regs_ok(rq))
goto out_unlock;

/*
* If the request was innocent, we leave the request in the ELSP
* and will try to replay it on restarting. The context image may
Expand All @@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
* image back to the expected values to skip over the guilty request.
*/
i915_reset_request(rq, stalled);
if (!stalled)
if (!stalled && lrc_regs_ok(rq))
goto out_unlock;

/*
Expand All @@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
engine->context_size - PAGE_SIZE);
}

/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
/* Rerun the request; its payload has been neutered (if guilty). */
rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
intel_ring_update_space(rq->ring);

execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
Expand Down
7 changes: 7 additions & 0 deletions drivers/gpu/drm/i915/selftests/igt_spinner.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ igt_spinner_create_request(struct igt_spinner *spin,

i915_gem_chipset_flush(spin->i915);

if (engine->emit_init_breadcrumb &&
rq->timeline->has_initial_breadcrumb) {
err = engine->emit_init_breadcrumb(rq);
if (err)
goto cancel_rq;
}

err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);

cancel_rq:
Expand Down
6 changes: 6 additions & 0 deletions drivers/gpu/drm/i915/selftests/intel_hangcheck.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
*batch++ = MI_BATCH_BUFFER_END; /* not reached */
i915_gem_chipset_flush(h->i915);

if (rq->engine->emit_init_breadcrumb) {
err = rq->engine->emit_init_breadcrumb(rq);
if (err)
goto cancel_rq;
}

flags = 0;
if (INTEL_GEN(vm->i915) <= 5)
flags |= I915_DISPATCH_SECURE;
Expand Down

0 comments on commit 21182b3

Please sign in to comment.