drm/i915/gt: Allow temporary suspension of inflight requests
In order to support out-of-line error capture, we need to remove the
active request from HW and put it to one side while a worker compresses
and stores all the details associated with that request. (As that
compression may take an arbitrary user-controlled amount of time, we
want to let the engine continue running on other workloads while the
hanging request is dumped.) Not only do we need to remove the active
request, but we also have to remove its context and all requests that
were dependent on it (whether in flight, queued, or yet to be submitted).

Finally, once the capture is complete, we need to be able to resubmit the
request and its dependents and allow them to execute.
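
The hold/unhold pair is meant to bracket that slow capture work in a worker,
off the engine. Below is a rough sketch of the intended usage; the
hold_capture struct, capture_worker() and defer_active_capture() are
illustrative names only, and the real hook-up into the error-capture path
follows in a later patch of this series.

struct hold_capture {
        struct work_struct work;
        struct intel_engine_cs *engine;
        struct i915_request *rq;
};

static void capture_worker(struct work_struct *wrk)
{
        struct hold_capture *cap = container_of(wrk, typeof(*cap), work);

        /*
         * Compress and store the error state for cap->rq here. This is the
         * slow, potentially user-controlled step that we do not want to run
         * under the engine locks or inside the reset path.
         */

        /* Put the request and its held dependents back on the priority queue */
        execlists_unhold(cap->engine, cap->rq);
        i915_request_put(cap->rq);
        kfree(cap);
}

static void defer_active_capture(struct intel_engine_cs *engine)
{
        struct hold_capture *cap;
        struct i915_request *rq;

        rq = execlists_active(&engine->execlists);
        if (!rq)
                return;

        cap = kmalloc(sizeof(*cap), GFP_ATOMIC);
        if (!cap)
                return;

        cap->engine = engine;
        cap->rq = i915_request_get(rq);

        /* Suspend the guilty request and its dependents while it is dumped */
        execlists_hold(engine, cap->rq);

        INIT_WORK(&cap->work, capture_worker);
        schedule_work(&cap->work);
}

Within this patch only the mechanism and the live_hold_reset selftest are
added; execlists_hold() and execlists_unhold() remain __maybe_unused until a
caller is wired up.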

v2: Replace stack recursion with a simple list.
v3: Check all the parents, not just the first, when searching for a
stuck ancestor!

References: https://gitlab.freedesktop.org/drm/intel/issues/738
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200116184754.2860848-2-chris@chris-wilson.co.uk
Chris Wilson committed Jan 16, 2020
1 parent 672c368 commit 32ff621
Showing 5 changed files with 321 additions and 6 deletions.
13 changes: 13 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -671,6 +671,7 @@ void
intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass)
{
INIT_LIST_HEAD(&engine->active.requests);
INIT_LIST_HEAD(&engine->active.hold);

spin_lock_init(&engine->active.lock);
lockdep_set_subclass(&engine->active.lock, subclass);
@@ -1422,6 +1423,17 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq)
}
}

static unsigned long list_count(struct list_head *list)
{
struct list_head *pos;
unsigned long count = 0;

list_for_each(pos, list)
count++;

return count;
}

void intel_engine_dump(struct intel_engine_cs *engine,
struct drm_printer *m,
const char *header, ...)
@@ -1491,6 +1503,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
}
}
drm_printf(m, "\tOn hold?: %lu\n", list_count(&engine->active.hold));
spin_unlock_irqrestore(&engine->active.lock, flags);

drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base);
1 change: 1 addition & 0 deletions drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -295,6 +295,7 @@ struct intel_engine_cs {
struct {
spinlock_t lock;
struct list_head requests;
struct list_head hold; /* ready requests, but on hold */
} active;

struct llist_head barrier_tasks;
167 changes: 161 additions & 6 deletions drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1635,8 +1635,8 @@ static void defer_request(struct i915_request *rq, struct list_head * const pl)
!i915_request_completed(rq));

GEM_BUG_ON(i915_request_is_active(w));
if (list_empty(&w->sched.link))
continue; /* Not yet submitted; unready */
if (!i915_request_is_ready(w))
continue;

if (rq_prio(w) < rq_prio(rq))
continue;
@@ -2354,6 +2354,145 @@ static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
}
}

static void __execlists_hold(struct i915_request *rq)
{
LIST_HEAD(list);

do {
struct i915_dependency *p;

if (i915_request_is_active(rq))
__i915_request_unsubmit(rq);

RQ_TRACE(rq, "on hold\n");
clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
list_move_tail(&rq->sched.link, &rq->engine->active.hold);
i915_request_set_hold(rq);

list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
struct i915_request *w =
container_of(p->waiter, typeof(*w), sched);

/* Leave semaphores spinning on the other engines */
if (w->engine != rq->engine)
continue;

if (!i915_request_is_ready(w))
continue;

if (i915_request_completed(w))
continue;

if (i915_request_on_hold(w))
continue;

list_move_tail(&w->sched.link, &list);
}

rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
} while (rq);
}

__maybe_unused
static void execlists_hold(struct intel_engine_cs *engine,
struct i915_request *rq)
{
spin_lock_irq(&engine->active.lock);

/*
* Transfer this request onto the hold queue to prevent it
* being resubmitted to HW (and potentially completed) before we have
* released it. Since we may have already submitted following
* requests, we need to remove those as well.
*/
GEM_BUG_ON(i915_request_on_hold(rq));
GEM_BUG_ON(rq->engine != engine);
__execlists_hold(rq);

spin_unlock_irq(&engine->active.lock);
}

static bool hold_request(const struct i915_request *rq)
{
struct i915_dependency *p;

/*
* If one of our ancestors is on hold, we must also be on hold,
* otherwise we will bypass it and execute before it.
*/
list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
const struct i915_request *s =
container_of(p->signaler, typeof(*s), sched);

if (s->engine != rq->engine)
continue;

if (i915_request_on_hold(s))
return true;
}

return false;
}

static void __execlists_unhold(struct i915_request *rq)
{
LIST_HEAD(list);

do {
struct i915_dependency *p;

GEM_BUG_ON(!i915_request_on_hold(rq));
GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));

i915_request_clear_hold(rq);
list_move_tail(&rq->sched.link,
i915_sched_lookup_priolist(rq->engine,
rq_prio(rq)));
set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
RQ_TRACE(rq, "hold release\n");

/* Also release any children on this engine that are ready */
list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
struct i915_request *w =
container_of(p->waiter, typeof(*w), sched);

if (w->engine != rq->engine)
continue;

if (!i915_request_on_hold(w))
continue;

/* Check that no other parents are also on hold */
if (hold_request(w))
continue;

list_move_tail(&w->sched.link, &list);
}

rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
} while (rq);
}

__maybe_unused
static void execlists_unhold(struct intel_engine_cs *engine,
struct i915_request *rq)
{
spin_lock_irq(&engine->active.lock);

/*
* Move this request back to the priority queue, and all of its
* children and grandchildren that were suspended along with it.
*/
__execlists_unhold(rq);

if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
engine->execlists.queue_priority_hint = rq_prio(rq);
tasklet_hi_schedule(&engine->execlists.tasklet);
}

spin_unlock_irq(&engine->active.lock);
}

static noinline void preempt_reset(struct intel_engine_cs *engine)
{
const unsigned int bit = I915_RESET_ENGINE + engine->id;
@@ -2466,6 +2605,13 @@ static void submit_queue(struct intel_engine_cs *engine,
__submit_queue_imm(engine);
}

static bool ancestor_on_hold(const struct intel_engine_cs *engine,
const struct i915_request *rq)
{
GEM_BUG_ON(i915_request_on_hold(rq));
return !list_empty(&engine->active.hold) && hold_request(rq);
}

static void execlists_submit_request(struct i915_request *request)
{
struct intel_engine_cs *engine = request->engine;
@@ -2474,12 +2620,17 @@ static void execlists_submit_request(struct i915_request *request)
/* Will be called from irq-context when using foreign fences. */
spin_lock_irqsave(&engine->active.lock, flags);

queue_request(engine, request);
if (unlikely(ancestor_on_hold(engine, request))) {
list_add_tail(&request->sched.link, &engine->active.hold);
i915_request_set_hold(request);
} else {
queue_request(engine, request);

GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
GEM_BUG_ON(list_empty(&request->sched.link));
GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
GEM_BUG_ON(list_empty(&request->sched.link));

submit_queue(engine, request);
submit_queue(engine, request);
}

spin_unlock_irqrestore(&engine->active.lock, flags);
}
@@ -3320,6 +3471,10 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
i915_priolist_free(p);
}

/* On-hold requests will be flushed to timeline upon their release */
list_for_each_entry(rq, &engine->active.hold, sched.link)
mark_eio(rq);

/* Cancel all attached virtual engines */
while ((rb = rb_first_cached(&execlists->virtual))) {
struct virtual_engine *ve =
103 changes: 103 additions & 0 deletions drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -285,6 +285,108 @@ static int live_unlite_preempt(void *arg)
return live_unlite_restore(arg, I915_USER_PRIORITY(I915_PRIORITY_MAX));
}

static int live_hold_reset(void *arg)
{
struct intel_gt *gt = arg;
struct intel_engine_cs *engine;
enum intel_engine_id id;
struct igt_spinner spin;
int err = 0;

/*
* In order to support offline error capture for fast preempt reset,
* we need to decouple the guilty request and ensure that it and its
* descendants are not executed while the capture is in progress.
*/

if (!intel_has_reset_engine(gt))
return 0;

if (igt_spinner_init(&spin, gt))
return -ENOMEM;

for_each_engine(engine, gt, id) {
struct intel_context *ce;
unsigned long heartbeat;
struct i915_request *rq;

ce = intel_context_create(engine);
if (IS_ERR(ce)) {
err = PTR_ERR(ce);
break;
}

engine_heartbeat_disable(engine, &heartbeat);

rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto out;
}
i915_request_add(rq);

if (!igt_wait_for_spinner(&spin, rq)) {
intel_gt_set_wedged(gt);
err = -ETIME;
goto out;
}

/* We have our request executing, now remove it and reset */

if (test_and_set_bit(I915_RESET_ENGINE + id,
&gt->reset.flags)) {
intel_gt_set_wedged(gt);
err = -EBUSY;
goto out;
}
tasklet_disable(&engine->execlists.tasklet);

engine->execlists.tasklet.func(engine->execlists.tasklet.data);
GEM_BUG_ON(execlists_active(&engine->execlists) != rq);

execlists_hold(engine, rq);
GEM_BUG_ON(!i915_request_on_hold(rq));

intel_engine_reset(engine, NULL);
GEM_BUG_ON(rq->fence.error != -EIO);

tasklet_enable(&engine->execlists.tasklet);
clear_and_wake_up_bit(I915_RESET_ENGINE + id,
&gt->reset.flags);

/* Check that we do not resubmit the held request */
i915_request_get(rq);
if (!i915_request_wait(rq, 0, HZ / 5)) {
pr_err("%s: on hold request completed!\n",
engine->name);
i915_request_put(rq);
err = -EIO;
goto out;
}
GEM_BUG_ON(!i915_request_on_hold(rq));

/* But is resubmitted on release */
execlists_unhold(engine, rq);
if (i915_request_wait(rq, 0, HZ / 5) < 0) {
pr_err("%s: held request did not complete!\n",
engine->name);
intel_gt_set_wedged(gt);
err = -ETIME;
}
i915_request_put(rq);

out:
engine_heartbeat_enable(engine, heartbeat);
intel_context_put(ce);
if (err)
break;
}

igt_spinner_fini(&spin);
return err;
}

static int
emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx)
{
@@ -3315,6 +3417,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
SUBTEST(live_sanitycheck),
SUBTEST(live_unlite_switch),
SUBTEST(live_unlite_preempt),
SUBTEST(live_hold_reset),
SUBTEST(live_timeslice_preempt),
SUBTEST(live_timeslice_queue),
SUBTEST(live_busywait_preempt),