Skip to content

Commit

Permalink
drm/i915/gt: Replace hangcheck by heartbeats
Browse files Browse the repository at this point in the history
Replace sampling the engine state every so often with a periodic
heartbeat request to measure the health of an engine. This is coupled
with the forced-preemption to allow long running requests to survive so
long as they do not block other users.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Reviewed-by: Jon Bloomfield <jon.bloomfield@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191023133108.21401-5-chris@chris-wilson.co.uk
  • Loading branch information
Chris Wilson committed Oct 23, 2019
1 parent 2e0986a commit 058179e
Show file tree
Hide file tree
Showing 25 changed files with 379 additions and 555 deletions.
11 changes: 11 additions & 0 deletions drivers/gpu/drm/i915/Kconfig.profile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@ config DRM_I915_USERFAULT_AUTOSUSPEND
May be 0 to disable the extra delay and solely use the device level
runtime pm autosuspend delay tunable.

config DRM_I915_HEARTBEAT_INTERVAL
int "Interval between heartbeat pulses (ms)"
default 2500 # milliseconds
help
The driver sends a periodic heartbeat down all active engines to
check the health of the GPU and undertake regular house-keeping of
internal driver state.

May be 0 to disable heartbeats and therefore disable automatic GPU
hang detection.

config DRM_I915_PREEMPT_TIMEOUT
int "Preempt timeout (ms, jiffy granularity)"
default 100 # milliseconds
Expand Down
1 change: 0 additions & 1 deletion drivers/gpu/drm/i915/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ gt-y += \
gt/intel_gt_pm.o \
gt/intel_gt_pm_irq.o \
gt/intel_gt_requests.o \
gt/intel_hangcheck.o \
gt/intel_llc.o \
gt/intel_lrc.o \
gt/intel_rc6.o \
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/display/intel_display.c
Original file line number Diff line number Diff line change
Expand Up @@ -14863,7 +14863,7 @@ static void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state)
static void fb_obj_bump_render_priority(struct drm_i915_gem_object *obj)
{
struct i915_sched_attr attr = {
.priority = I915_PRIORITY_DISPLAY,
.priority = I915_USER_PRIORITY(I915_PRIORITY_DISPLAY),
};

i915_gem_object_wait_priority(obj, 0, &attr);
Expand Down
1 change: 0 additions & 1 deletion drivers/gpu/drm/i915/gem/i915_gem_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,5 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
unsigned int flags,
const struct i915_sched_attr *attr);
#define I915_PRIORITY_DISPLAY I915_USER_PRIORITY(I915_PRIORITY_MAX)

#endif
2 changes: 0 additions & 2 deletions drivers/gpu/drm/i915/gem/i915_gem_pm.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ void i915_gem_suspend(struct drm_i915_private *i915)
intel_gt_suspend(&i915->gt);
intel_uc_suspend(&i915->gt.uc);

cancel_delayed_work_sync(&i915->gt.hangcheck.work);

i915_gem_drain_freed_objects(i915);
}

Expand Down
32 changes: 0 additions & 32 deletions drivers/gpu/drm/i915/gt/intel_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,38 +91,6 @@ struct intel_gt;
/* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW to
* do the writes, and that must have qw aligned offsets, simply pretend it's 8b.
*/
enum intel_engine_hangcheck_action {
ENGINE_IDLE = 0,
ENGINE_WAIT,
ENGINE_ACTIVE_SEQNO,
ENGINE_ACTIVE_HEAD,
ENGINE_ACTIVE_SUBUNITS,
ENGINE_WAIT_KICK,
ENGINE_DEAD,
};

static inline const char *
hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
{
switch (a) {
case ENGINE_IDLE:
return "idle";
case ENGINE_WAIT:
return "wait";
case ENGINE_ACTIVE_SEQNO:
return "active seqno";
case ENGINE_ACTIVE_HEAD:
return "active head";
case ENGINE_ACTIVE_SUBUNITS:
return "active subunits";
case ENGINE_WAIT_KICK:
return "wait kick";
case ENGINE_DEAD:
return "dead";
}

return "unknown";
}

static inline unsigned int
execlists_num_ports(const struct intel_engine_execlists * const execlists)
Expand Down
12 changes: 9 additions & 3 deletions drivers/gpu/drm/i915/gt/intel_engine_cs.c
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,8 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
engine->instance = info->instance;
__sprint_engine_name(engine);

engine->props.heartbeat_interval_ms =
CONFIG_DRM_I915_HEARTBEAT_INTERVAL;
engine->props.preempt_timeout_ms =
CONFIG_DRM_I915_PREEMPT_TIMEOUT;
engine->props.stop_timeout_ms =
Expand Down Expand Up @@ -609,7 +611,6 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
intel_engine_init_active(engine, ENGINE_PHYSICAL);
intel_engine_init_breadcrumbs(engine);
intel_engine_init_execlists(engine);
intel_engine_init_hangcheck(engine);
intel_engine_init_cmd_parser(engine);
intel_engine_init__pm(engine);

Expand Down Expand Up @@ -1470,8 +1471,13 @@ void intel_engine_dump(struct intel_engine_cs *engine,
drm_printf(m, "*** WEDGED ***\n");

drm_printf(m, "\tAwake? %d\n", atomic_read(&engine->wakeref.count));
drm_printf(m, "\tHangcheck: %d ms ago\n",
jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));

rcu_read_lock();
rq = READ_ONCE(engine->heartbeat.systole);
if (rq)
drm_printf(m, "\tHeartbeat: %d ms ago\n",
jiffies_to_msecs(jiffies - rq->emitted_jiffies));
rcu_read_unlock();
drm_printf(m, "\tReset count: %d (global %d)\n",
i915_reset_engine_count(error, engine),
i915_reset_count(error));
Expand Down
157 changes: 157 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,170 @@
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
* While the engine is active, we send a periodic pulse along the engine
* to check on its health and to flush any idle-barriers. If that request
* is stuck, and we fail to preempt it, we declare the engine hung and
* issue a reset -- in the hope that restores progress.
*/

static bool next_heartbeat(struct intel_engine_cs *engine)
{
long delay;

delay = READ_ONCE(engine->props.heartbeat_interval_ms);
if (!delay)
return false;

delay = msecs_to_jiffies_timeout(delay);
if (delay >= HZ)
delay = round_jiffies_up_relative(delay);
schedule_delayed_work(&engine->heartbeat.work, delay);

return true;
}

static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
i915_request_add_active_barriers(rq);
}

static void show_heartbeat(const struct i915_request *rq,
struct intel_engine_cs *engine)
{
struct drm_printer p = drm_debug_printer("heartbeat");

intel_engine_dump(engine, &p,
"%s heartbeat {prio:%d} not ticking\n",
engine->name,
rq->sched.attr.priority);
}

static void heartbeat(struct work_struct *wrk)
{
struct i915_sched_attr attr = {
.priority = I915_USER_PRIORITY(I915_PRIORITY_MIN),
};
struct intel_engine_cs *engine =
container_of(wrk, typeof(*engine), heartbeat.work.work);
struct intel_context *ce = engine->kernel_context;
struct i915_request *rq;

if (!intel_engine_pm_get_if_awake(engine))
return;

rq = engine->heartbeat.systole;
if (rq && i915_request_completed(rq)) {
i915_request_put(rq);
engine->heartbeat.systole = NULL;
}

if (intel_gt_is_wedged(engine->gt))
goto out;

if (engine->heartbeat.systole) {
if (engine->schedule &&
rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
/*
* Gradually raise the priority of the heartbeat to
* give high priority work [which presumably desires
* low latency and no jitter] the chance to naturally
* complete before being preempted.
*/
attr.priority = I915_PRIORITY_MASK;
if (rq->sched.attr.priority >= attr.priority)
attr.priority |= I915_USER_PRIORITY(I915_PRIORITY_HEARTBEAT);
if (rq->sched.attr.priority >= attr.priority)
attr.priority = I915_PRIORITY_BARRIER;

local_bh_disable();
engine->schedule(rq, &attr);
local_bh_enable();
} else {
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
show_heartbeat(rq, engine);

intel_gt_handle_error(engine->gt, engine->mask,
I915_ERROR_CAPTURE,
"stopped heartbeat on %s",
engine->name);
}
goto out;
}

if (engine->wakeref_serial == engine->serial)
goto out;

mutex_lock(&ce->timeline->mutex);

intel_context_enter(ce);
rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
intel_context_exit(ce);
if (IS_ERR(rq))
goto unlock;

idle_pulse(engine, rq);
if (i915_modparams.enable_hangcheck)
engine->heartbeat.systole = i915_request_get(rq);

__i915_request_commit(rq);
__i915_request_queue(rq, &attr);

unlock:
mutex_unlock(&ce->timeline->mutex);
out:
if (!next_heartbeat(engine))
i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
intel_engine_pm_put(engine);
}

void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
return;

next_heartbeat(engine);
}

void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
cancel_delayed_work(&engine->heartbeat.work);
i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
unsigned long delay)
{
int err;

/* Send one last pulse before to cleanup persistent hogs */
if (!delay && CONFIG_DRM_I915_PREEMPT_TIMEOUT) {
err = intel_engine_pulse(engine);
if (err)
return err;
}

WRITE_ONCE(engine->props.heartbeat_interval_ms, delay);

if (intel_engine_pm_get_if_awake(engine)) {
if (delay)
intel_engine_unpark_heartbeat(engine);
else
intel_engine_park_heartbeat(engine);
intel_engine_pm_put(engine);
}

return 0;
}

int intel_engine_pulse(struct intel_engine_cs *engine)
{
struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
Expand Down
8 changes: 8 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@

struct intel_engine_cs;

void intel_engine_init_heartbeat(struct intel_engine_cs *engine);

int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
unsigned long delay);

void intel_engine_park_heartbeat(struct intel_engine_cs *engine);
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine);

int intel_engine_pulse(struct intel_engine_cs *engine);
int intel_engine_flush_barriers(struct intel_engine_cs *engine);

Expand Down
5 changes: 4 additions & 1 deletion drivers/gpu/drm/i915/gt/intel_engine_pm.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "i915_drv.h"

#include "intel_engine.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine_pool.h"
#include "intel_gt.h"
Expand Down Expand Up @@ -34,7 +35,7 @@ static int __engine_unpark(struct intel_wakeref *wf)
if (engine->unpark)
engine->unpark(engine);

intel_engine_init_hangcheck(engine);
intel_engine_unpark_heartbeat(engine);
return 0;
}

Expand Down Expand Up @@ -158,6 +159,7 @@ static int __engine_park(struct intel_wakeref *wf)

call_idle_barriers(engine); /* cleanup after wedging */

intel_engine_park_heartbeat(engine);
intel_engine_disarm_breadcrumbs(engine);
intel_engine_pool_park(&engine->pool);

Expand Down Expand Up @@ -188,6 +190,7 @@ void intel_engine_init__pm(struct intel_engine_cs *engine)
struct intel_runtime_pm *rpm = engine->uncore->rpm;

intel_wakeref_init(&engine->wakeref, rpm, &wf_ops);
intel_engine_init_heartbeat(engine);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
Expand Down
17 changes: 7 additions & 10 deletions drivers/gpu/drm/i915/gt/intel_engine_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <linux/rbtree.h>
#include <linux/timer.h>
#include <linux/types.h>
#include <linux/workqueue.h>

#include "i915_gem.h"
#include "i915_pmu.h"
Expand Down Expand Up @@ -76,14 +77,6 @@ struct intel_instdone {
u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES];
};

struct intel_engine_hangcheck {
u64 acthd;
u32 last_ring;
u32 last_head;
unsigned long action_timestamp;
struct intel_instdone instdone;
};

struct intel_ring {
struct kref ref;
struct i915_vma *vma;
Expand Down Expand Up @@ -331,6 +324,11 @@ struct intel_engine_cs {

intel_engine_mask_t saturated; /* submitting semaphores too late? */

struct {
struct delayed_work work;
struct i915_request *systole;
} heartbeat;

unsigned long serial;

unsigned long wakeref_serial;
Expand Down Expand Up @@ -481,8 +479,6 @@ struct intel_engine_cs {
/* status_notifier: list of callbacks for context-switch changes */
struct atomic_notifier_head context_status_notifier;

struct intel_engine_hangcheck hangcheck;

#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
#define I915_ENGINE_SUPPORTS_STATS BIT(1)
#define I915_ENGINE_HAS_PREEMPTION BIT(2)
Expand Down Expand Up @@ -549,6 +545,7 @@ struct intel_engine_cs {
} stats;

struct {
unsigned long heartbeat_interval_ms;
unsigned long preempt_timeout_ms;
unsigned long stop_timeout_ms;
} props;
Expand Down
Loading

0 comments on commit 058179e

Please sign in to comment.