Skip to content

Commit

Permalink
drm/i915: Allow userspace to request no-error-capture upon GPU hangs
Browse files Browse the repository at this point in the history
igt likes to inject GPU hangs into its command streams. However, as we
expect these hangs, we don't actually want them recorded in the dmesg
output or stored in the i915_error_state (usually). To accommodate this,
allow userspace to set a flag on the context so that any hang emanating
from that context will not be recorded. We still do the error capture
(otherwise how do we find the guilty context and know its intent?) as
part of the reason for random GPU hang injection is to exercise the race
conditions between the error capture and normal execution.

v2: Split out the request->ringbuf error capture changes.
v3: Move the flag defines next to the intel_context->flags definition

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Dave Gordon <david.s.gordon@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1467616119-4093-9-git-send-email-chris@chris-wilson.co.uk
  • Loading branch information
Chris Wilson committed Jul 4, 2016
1 parent ba6e041 commit bc3d674
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 9 deletions.
4 changes: 3 additions & 1 deletion drivers/gpu/drm/i915/i915_drv.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ struct drm_i915_error_state {
struct timeval time;

char error_msg[128];
bool simulated;
int iommu;
u32 reset_count;
u32 suspend_count;
Expand Down Expand Up @@ -875,9 +876,10 @@ struct i915_gem_context {

/* Unique identifier for this context, used by the hw for tracking */
unsigned long flags;
#define CONTEXT_NO_ZEROMAP BIT(0)
#define CONTEXT_NO_ERROR_CAPTURE BIT(1)
unsigned hw_id;
u32 user_handle;
#define CONTEXT_NO_ZEROMAP (1<<0)

u32 ggtt_alignment;

Expand Down
13 changes: 13 additions & 0 deletions drivers/gpu/drm/i915/i915_gem_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
else
args->value = to_i915(dev)->ggtt.base.total;
break;
case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
break;
default:
ret = -EINVAL;
break;
Expand Down Expand Up @@ -1071,6 +1074,16 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
ctx->flags |= args->value ? CONTEXT_NO_ZEROMAP : 0;
}
break;
case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
if (args->size) {
ret = -EINVAL;
} else {
if (args->value)
ctx->flags |= CONTEXT_NO_ERROR_CAPTURE;
else
ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
}
break;
default:
ret = -EINVAL;
break;
Expand Down
20 changes: 12 additions & 8 deletions drivers/gpu/drm/i915/i915_gpu_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -1093,9 +1093,8 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
struct i915_address_space *vm;
struct intel_ringbuffer *rb;

vm = request->ctx && request->ctx->ppgtt ?
&request->ctx->ppgtt->base :
&ggtt->base;
vm = request->ctx->ppgtt ?
&request->ctx->ppgtt->base : &ggtt->base;

/* We need to copy these to an anonymous buffer
* as the simplest method to avoid being overwritten
Expand Down Expand Up @@ -1123,6 +1122,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
rcu_read_unlock();
}

error->simulated |=
request->ctx->flags & CONTEXT_NO_ERROR_CAPTURE;

rb = request->ringbuf;
error->ring[i].cpu_ring_head = rb->head;
error->ring[i].cpu_ring_tail = rb->tail;
Expand Down Expand Up @@ -1422,12 +1424,14 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
DRM_INFO("%s\n", error->error_msg);

spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
if (dev_priv->gpu_error.first_error == NULL) {
dev_priv->gpu_error.first_error = error;
error = NULL;
if (!error->simulated) {
spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
if (!dev_priv->gpu_error.first_error) {
dev_priv->gpu_error.first_error = error;
error = NULL;
}
spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
}
spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);

if (error) {
i915_error_state_free(&error->ref);
Expand Down
1 change: 1 addition & 0 deletions include/uapi/drm/i915_drm.h
Original file line number Diff line number Diff line change
Expand Up @@ -1173,6 +1173,7 @@ struct drm_i915_gem_context_param {
#define I915_CONTEXT_PARAM_BAN_PERIOD 0x1
#define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2
#define I915_CONTEXT_PARAM_GTT_SIZE 0x3
#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4
__u64 value;
};

Expand Down

0 comments on commit bc3d674

Please sign in to comment.