Skip to content

Commit

Permalink
drm/i915: Improve user experience and driver robustness under SIGINT …
Browse files Browse the repository at this point in the history
…or similar

We have long standing customer complaints that pressing Ctrl-C (or to the
effect of) causes engine resets with otherwise well behaving programs.

Not only is logging engine resets during normal operation not desirable
since it creates support incidents, but more fundamentally we should avoid
going the engine reset path when we can since any engine reset introduces
a chance of harming an innocent context.

Reason for this undesirable behaviour is that the driver currently does
not distinguish between banned contexts and non-persistent contexts which
have been closed.

To fix this we add the distinction between the two reasons for revoking
contexts, which then allows the strict timeout only be applied to banned,
while innocent contexts (well behaving) can preempt cleanly and exit
without triggering the engine reset path.

Note that the added context exiting category applies both to closed non-
persistent context, and any exiting context when hangcheck has been
disabled by the user.

At the same time we rename the backend operation from 'ban' to 'revoke'
which more accurately describes the actual semantics. (There is no ban at
the backend level since banning is a concept driven by the scheduling
frontend. Backends are simply able to revoke a running context so that
is the more appropriate name chosen.)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220527072452.2225610-1-tvrtko.ursulin@linux.intel.com
  • Loading branch information
Tvrtko Ursulin committed Jun 17, 2022
1 parent 1556c3b commit 45c64ec
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 29 deletions.
23 changes: 15 additions & 8 deletions drivers/gpu/drm/i915/gem/i915_gem_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -1367,7 +1367,8 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
return engine;
}

static void kill_engines(struct i915_gem_engines *engines, bool ban)
static void
kill_engines(struct i915_gem_engines *engines, bool exit, bool persistent)
{
struct i915_gem_engines_iter it;
struct intel_context *ce;
Expand All @@ -1381,9 +1382,15 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
*/
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
bool skip = false;

if (ban && intel_context_ban(ce, NULL))
continue;
if (exit)
skip = intel_context_set_exiting(ce);
else if (!persistent)
skip = intel_context_exit_nonpersistent(ce, NULL);

if (skip)
continue; /* Already marked. */

/*
* Check the current active state of this context; if we
Expand All @@ -1395,7 +1402,7 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
engine = active_engine(ce);

/* First attempt to gracefully cancel the context */
if (engine && !__cancel_engine(engine) && ban)
if (engine && !__cancel_engine(engine) && (exit || !persistent))
/*
* If we are unable to send a preemptive pulse to bump
* the context from the GPU, we have to resort to a full
Expand All @@ -1407,8 +1414,6 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)

static void kill_context(struct i915_gem_context *ctx)
{
bool ban = (!i915_gem_context_is_persistent(ctx) ||
!ctx->i915->params.enable_hangcheck);
struct i915_gem_engines *pos, *next;

spin_lock_irq(&ctx->stale.lock);
Expand All @@ -1421,7 +1426,8 @@ static void kill_context(struct i915_gem_context *ctx)

spin_unlock_irq(&ctx->stale.lock);

kill_engines(pos, ban);
kill_engines(pos, !ctx->i915->params.enable_hangcheck,
i915_gem_context_is_persistent(ctx));

spin_lock_irq(&ctx->stale.lock);
GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence));
Expand Down Expand Up @@ -1467,7 +1473,8 @@ static void engines_idle_release(struct i915_gem_context *ctx,

kill:
if (list_empty(&engines->link)) /* raced, already closed */
kill_engines(engines, true);
kill_engines(engines, true,
i915_gem_context_is_persistent(ctx));

i915_sw_fence_commit(&engines->fence);
}
Expand Down
24 changes: 24 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,30 @@ u64 intel_context_get_avg_runtime_ns(struct intel_context *ce)
return avg;
}

bool intel_context_ban(struct intel_context *ce, struct i915_request *rq)
{
bool ret = intel_context_set_banned(ce);

trace_intel_context_ban(ce);

if (ce->ops->revoke)
ce->ops->revoke(ce, rq,
INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS);

return ret;
}

bool intel_context_exit_nonpersistent(struct intel_context *ce,
struct i915_request *rq)
{
bool ret = intel_context_set_exiting(ce);

if (ce->ops->revoke)
ce->ops->revoke(ce, rq, ce->engine->props.preempt_timeout_ms);

return ret;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_context.c"
#endif
25 changes: 18 additions & 7 deletions drivers/gpu/drm/i915/gt/intel_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
##__VA_ARGS__); \
} while (0)

#define INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS (1)

struct i915_gem_ww_ctx;

void intel_context_init(struct intel_context *ce,
Expand Down Expand Up @@ -309,18 +311,27 @@ static inline bool intel_context_set_banned(struct intel_context *ce)
return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
}

static inline bool intel_context_ban(struct intel_context *ce,
struct i915_request *rq)
bool intel_context_ban(struct intel_context *ce, struct i915_request *rq);

static inline bool intel_context_is_schedulable(const struct intel_context *ce)
{
bool ret = intel_context_set_banned(ce);
return !test_bit(CONTEXT_EXITING, &ce->flags) &&
!test_bit(CONTEXT_BANNED, &ce->flags);
}

trace_intel_context_ban(ce);
if (ce->ops->ban)
ce->ops->ban(ce, rq);
static inline bool intel_context_is_exiting(const struct intel_context *ce)
{
return test_bit(CONTEXT_EXITING, &ce->flags);
}

return ret;
static inline bool intel_context_set_exiting(struct intel_context *ce)
{
return test_and_set_bit(CONTEXT_EXITING, &ce->flags);
}

bool intel_context_exit_nonpersistent(struct intel_context *ce,
struct i915_request *rq);

static inline bool
intel_context_force_single_submission(const struct intel_context *ce)
{
Expand Down
4 changes: 3 additions & 1 deletion drivers/gpu/drm/i915/gt/intel_context_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ struct intel_context_ops {

int (*alloc)(struct intel_context *ce);

void (*ban)(struct intel_context *ce, struct i915_request *rq);
void (*revoke)(struct intel_context *ce, struct i915_request *rq,
unsigned int preempt_timeout_ms);

int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void **vaddr);
int (*pin)(struct intel_context *ce, void *vaddr);
Expand Down Expand Up @@ -122,6 +123,7 @@ struct intel_context {
#define CONTEXT_GUC_INIT 10
#define CONTEXT_PERMA_PIN 11
#define CONTEXT_IS_PARKING 12
#define CONTEXT_EXITING 13

struct {
u64 timeout_us;
Expand Down
6 changes: 3 additions & 3 deletions drivers/gpu/drm/i915/gt/intel_execlists_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ __execlists_schedule_in(struct i915_request *rq)

if (unlikely(intel_context_is_closed(ce) &&
!intel_engine_has_heartbeat(engine)))
intel_context_set_banned(ce);
intel_context_set_exiting(ce);

if (unlikely(intel_context_is_banned(ce) || bad_request(rq)))
if (unlikely(!intel_context_is_schedulable(ce) || bad_request(rq)))
reset_active(rq, engine);

if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
Expand Down Expand Up @@ -1243,7 +1243,7 @@ static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,

/* Force a fast reset for terminated contexts (ignoring sysfs!) */
if (unlikely(intel_context_is_banned(rq->context) || bad_request(rq)))
return 1;
return INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS;

return READ_ONCE(engine->props.preempt_timeout_ms);
}
Expand Down
7 changes: 4 additions & 3 deletions drivers/gpu/drm/i915/gt/intel_ring_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -598,8 +598,9 @@ static void ring_context_reset(struct intel_context *ce)
clear_bit(CONTEXT_VALID_BIT, &ce->flags);
}

static void ring_context_ban(struct intel_context *ce,
struct i915_request *rq)
static void ring_context_revoke(struct intel_context *ce,
struct i915_request *rq,
unsigned int preempt_timeout_ms)
{
struct intel_engine_cs *engine;

Expand Down Expand Up @@ -634,7 +635,7 @@ static const struct intel_context_ops ring_context_ops = {

.cancel_request = ring_context_cancel_request,

.ban = ring_context_ban,
.revoke = ring_context_revoke,

.pre_pin = ring_context_pre_pin,
.pin = ring_context_pin,
Expand Down
15 changes: 9 additions & 6 deletions drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -2790,7 +2790,9 @@ static void __guc_context_set_preemption_timeout(struct intel_guc *guc,
__guc_context_set_context_policies(guc, &policy, true);
}

static void guc_context_ban(struct intel_context *ce, struct i915_request *rq)
static void
guc_context_revoke(struct intel_context *ce, struct i915_request *rq,
unsigned int preempt_timeout_ms)
{
struct intel_guc *guc = ce_to_guc(ce);
struct intel_runtime_pm *runtime_pm =
Expand Down Expand Up @@ -2829,15 +2831,16 @@ static void guc_context_ban(struct intel_context *ce, struct i915_request *rq)
* gets kicked off the HW ASAP.
*/
with_intel_runtime_pm(runtime_pm, wakeref) {
__guc_context_set_preemption_timeout(guc, guc_id, 1);
__guc_context_set_preemption_timeout(guc, guc_id,
preempt_timeout_ms);
__guc_context_sched_disable(guc, ce, guc_id);
}
} else {
if (!context_guc_id_invalid(ce))
with_intel_runtime_pm(runtime_pm, wakeref)
__guc_context_set_preemption_timeout(guc,
ce->guc_id.id,
1);
preempt_timeout_ms);
spin_unlock_irqrestore(&ce->guc_state.lock, flags);
}
}
Expand Down Expand Up @@ -3190,7 +3193,7 @@ static const struct intel_context_ops guc_context_ops = {
.unpin = guc_context_unpin,
.post_unpin = guc_context_post_unpin,

.ban = guc_context_ban,
.revoke = guc_context_revoke,

.cancel_request = guc_context_cancel_request,

Expand Down Expand Up @@ -3439,7 +3442,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
.unpin = guc_virtual_context_unpin,
.post_unpin = guc_context_post_unpin,

.ban = guc_context_ban,
.revoke = guc_context_revoke,

.cancel_request = guc_context_cancel_request,

Expand Down Expand Up @@ -3528,7 +3531,7 @@ static const struct intel_context_ops virtual_parent_context_ops = {
.unpin = guc_parent_context_unpin,
.post_unpin = guc_context_post_unpin,

.ban = guc_context_ban,
.revoke = guc_context_revoke,

.cancel_request = guc_context_cancel_request,

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/i915_request.c
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ bool __i915_request_submit(struct i915_request *request)
goto active;
}

if (unlikely(intel_context_is_banned(request->context)))
if (unlikely(!intel_context_is_schedulable(request->context)))
i915_request_set_error_once(request, -EIO);

if (unlikely(fatal_error(request->fence.error)))
Expand Down

0 comments on commit 45c64ec

Please sign in to comment.