Skip to content

Commit

Permalink
drm/i915/gt: Expose heartbeat interval via sysfs
Browse files Browse the repository at this point in the history
We monitor the health of the system via periodic heartbeat pulses. The
pulses also provide the opportunity to perform garbage collection.
However, we interpret an incomplete pulse (a missed heartbeat) as an
indication that the system is no longer responsive, i.e. hung, and
perform an engine or full GPU reset. Given that the preemption
granularity can be very coarse on a system, we let the sysadmin override
our legacy timeouts which were "optimised" for desktop applications.

The heartbeat interval can be adjusted per-engine using,

	/sys/class/drm/card?/engine/*/heartbeat_interval_ms

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Steve Carbonari <steven.carbonari@intel.com>
Tested-by: Steve Carbonari <steven.carbonari@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200228131716.3243616-7-chris@chris-wilson.co.uk
  • Loading branch information
Chris Wilson committed Feb 28, 2020
1 parent db3d833 commit 9a40bdd
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 0 deletions.
3 changes: 3 additions & 0 deletions drivers/gpu/drm/i915/Kconfig.profile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ config DRM_I915_HEARTBEAT_INTERVAL
check the health of the GPU and undertake regular house-keeping of
internal driver state.

This is adjustable via
/sys/class/drm/card?/engine/*/heartbeat_interval_ms

May be 0 to disable heartbeats and therefore disable automatic GPU
hang detection.

Expand Down
47 changes: 47 additions & 0 deletions drivers/gpu/drm/i915/gt/sysfs_engines.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "i915_drv.h"
#include "intel_engine.h"
#include "intel_engine_heartbeat.h"
#include "sysfs_engines.h"

struct kobj_engine {
Expand Down Expand Up @@ -315,6 +316,49 @@ preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr,
static struct kobj_attribute preempt_timeout_attr =
__ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, preempt_timeout_store);

static ssize_t
heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct intel_engine_cs *engine = kobj_to_engine(kobj);
unsigned long long delay;
int err;

/*
* We monitor the health of the system via periodic heartbeat pulses.
* The pulses also provide the opportunity to perform garbage
* collection. However, we interpret an incomplete pulse (a missed
* heartbeat) as an indication that the system is no longer responsive,
* i.e. hung, and perform an engine or full GPU reset. Given that the
* preemption granularity can be very coarse on a system, the optimal
* value for any workload is unknowable!
*/

err = kstrtoull(buf, 0, &delay);
if (err)
return err;

if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
return -EINVAL;

err = intel_engine_set_heartbeat(engine, delay);
if (err)
return err;

return count;
}

static ssize_t
heartbeat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct intel_engine_cs *engine = kobj_to_engine(kobj);

return sprintf(buf, "%lu\n", engine->props.heartbeat_interval_ms);
}

static struct kobj_attribute heartbeat_interval_attr =
__ATTR(heartbeat_interval_ms, 0644, heartbeat_show, heartbeat_store);

static void kobj_engine_release(struct kobject *kobj)
{
kfree(kobj);
Expand Down Expand Up @@ -357,6 +401,9 @@ void intel_engines_add_sysfs(struct drm_i915_private *i915)
&all_caps_attr.attr,
&max_spin_attr.attr,
&stop_timeout_attr.attr,
#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL
&heartbeat_interval_attr.attr,
#endif
NULL
};

Expand Down

0 comments on commit 9a40bdd

Please sign in to comment.