Skip to content

Commit

Permalink
drm/i915/ttm: handle blitter failure on DG2
Browse files Browse the repository at this point in the history
If the move or clear operation somehow fails, and the memory underneath
is not cleared, like when moving to lmem, then we currently fallback to
memcpy or memset. However with small-BAR systems this fallback might no
longer be possible. For now we use the set_wedged sledgehammer if we
ever encounter such a scenario, and mark the object as borked to plug
any holes where access to the memory underneath can happen. Add some
basic selftests to exercise this.

v2:
  - In the selftests make sure we grab the runtime pm around the reset.
    Also make sure we grab the reset lock before checking if the device
    is wedged, since the wedge might still be in-progress and hence the
    bit might not be set yet.
  - Don't wedge or put the object into an unknown state, if the request
    construction fails (or similar). Just returning an error and
    skipping the fallback should be safe here.
  - Make sure we wedge each gt. (Thomas)
  - Peek at the unknown_state in io_reserve, that way we don't have to
    export or hand roll the fault_wait_for_idle. (Thomas)
  - Add the missing read-side barriers for the unknown_state. (Thomas)
  - Some kernel-doc fixes. (Thomas)
v3:
  - Tweak the ordering of the set_wedged, also add FIXME.

Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Jordan Justen <jordan.l.justen@intel.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220629174350.384910-11-matthew.auld@intel.com
  • Loading branch information
Matthew Auld committed Jul 1, 2022
1 parent 11f01dc commit bfe53be
Show file tree
Hide file tree
Showing 10 changed files with 353 additions and 48 deletions.
21 changes: 21 additions & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_object.c
Original file line number Diff line number Diff line change
Expand Up @@ -783,10 +783,31 @@ int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
intr, MAX_SCHEDULE_TIMEOUT);
if (!ret)
ret = -ETIME;
else if (ret > 0 && i915_gem_object_has_unknown_state(obj))
ret = -EIO;

return ret < 0 ? ret : 0;
}

/**
* i915_gem_object_has_unknown_state - Return true if the object backing pages are
* in an unknown_state. This means that userspace must NEVER be allowed to touch
* the pages, with either the GPU or CPU.
*
* ONLY valid to be called after ensuring that all kernel fences have signalled
* (in particular the fence for moving/clearing the object).
*/
bool i915_gem_object_has_unknown_state(struct drm_i915_gem_object *obj)
{
/*
* The below barrier pairs with the dma_fence_signal() in
* __memcpy_work(). We should only sample the unknown_state after all
* the kernel fences have signalled.
*/
smp_rmb();
return obj->mm.unknown_state;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/huge_gem_object.c"
#include "selftests/huge_pages.c"
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,7 @@ int i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj,
struct dma_fence **fence);
int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
bool intr);
bool i915_gem_object_has_unknown_state(struct drm_i915_gem_object *obj);

void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj,
unsigned int cache_level);
Expand Down
18 changes: 18 additions & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_object_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,24 @@ struct drm_i915_gem_object {
*/
bool ttm_shrinkable;

/**
* @unknown_state: Indicate that the object is effectively
* borked. This is write-once and set if we somehow encounter a
* fatal error when moving/clearing the pages, and we are not
* able to fallback to memcpy/memset, like on small-BAR systems.
* The GPU should also be wedged (or in the process) at this
* point.
*
* Only valid to read this after acquiring the dma-resv lock and
* waiting for all DMA_RESV_USAGE_KERNEL fences to be signalled,
* or if we otherwise know that the moving fence has signalled,
* and we are certain the pages underneath are valid for
* immediate access (under normal operation), like just prior to
* binding the object or when setting up the CPU fault handler.
* See i915_gem_object_has_unknown_state();
*/
bool unknown_state;

/**
* Priority list of potential placements for this object.
*/
Expand Down
26 changes: 25 additions & 1 deletion drivers/gpu/drm/i915/gem/i915_gem_ttm.c
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,15 @@ static void i915_ttm_swap_notify(struct ttm_buffer_object *bo)
i915_ttm_purge(obj);
}

static bool i915_ttm_resource_mappable(struct ttm_resource *res)
/**
* i915_ttm_resource_mappable - Return true if the ttm resource is CPU
* accessible.
* @res: The TTM resource to check.
*
* This is interesting on small-BAR systems where we may encounter lmem objects
* that can't be accessed via the CPU.
*/
bool i915_ttm_resource_mappable(struct ttm_resource *res)
{
struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res);

Expand All @@ -687,6 +695,22 @@ static bool i915_ttm_resource_mappable(struct ttm_resource *res)

static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
{
struct drm_i915_gem_object *obj = i915_ttm_to_gem(mem->bo);
bool unknown_state;

if (!obj)
return -EINVAL;

if (!kref_get_unless_zero(&obj->base.refcount))
return -EINVAL;

assert_object_held(obj);

unknown_state = i915_gem_object_has_unknown_state(obj);
i915_gem_object_put(obj);
if (unknown_state)
return -EINVAL;

if (!i915_ttm_cpu_maps_iomem(mem))
return 0;

Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_ttm.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,7 @@ static inline bool i915_ttm_cpu_maps_iomem(struct ttm_resource *mem)
/* Once / if we support GGTT, this is also false for cached ttm_tts */
return mem->mem_type != I915_PL_SYSTEM;
}

bool i915_ttm_resource_mappable(struct ttm_resource *res);

#endif
96 changes: 83 additions & 13 deletions drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,19 @@
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
static bool fail_gpu_migration;
static bool fail_work_allocation;
static bool ban_memcpy;

void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
bool work_allocation)
{
fail_gpu_migration = gpu_migration;
fail_work_allocation = work_allocation;
}

void i915_ttm_migrate_set_ban_memcpy(bool ban)
{
ban_memcpy = ban;
}
#endif

static enum i915_cache_level
Expand Down Expand Up @@ -258,15 +264,23 @@ struct i915_ttm_memcpy_arg {
* from the callback for lockdep reasons.
* @cb: Callback for the accelerated migration fence.
* @arg: The argument for the memcpy functionality.
* @i915: The i915 pointer.
* @obj: The GEM object.
* @memcpy_allowed: Instead of processing the @arg, and falling back to memcpy
* or memset, we wedge the device and set the @obj unknown_state, to prevent
* further access to the object with the CPU or GPU. On some devices we might
* only be permitted to use the blitter engine for such operations.
*/
struct i915_ttm_memcpy_work {
struct dma_fence fence;
struct work_struct work;
/* The fence lock */
spinlock_t lock;
struct irq_work irq_work;
struct dma_fence_cb cb;
struct i915_ttm_memcpy_arg arg;
struct drm_i915_private *i915;
struct drm_i915_gem_object *obj;
bool memcpy_allowed;
};

static void i915_ttm_move_memcpy(struct i915_ttm_memcpy_arg *arg)
Expand Down Expand Up @@ -317,14 +331,42 @@ static void __memcpy_work(struct work_struct *work)
struct i915_ttm_memcpy_work *copy_work =
container_of(work, typeof(*copy_work), work);
struct i915_ttm_memcpy_arg *arg = &copy_work->arg;
bool cookie = dma_fence_begin_signalling();
bool cookie;

/*
* FIXME: We need to take a closer look here. We should be able to plonk
* this into the fence critical section.
*/
if (!copy_work->memcpy_allowed) {
struct intel_gt *gt;
unsigned int id;

for_each_gt(gt, copy_work->i915, id)
intel_gt_set_wedged(gt);
}

cookie = dma_fence_begin_signalling();

if (copy_work->memcpy_allowed) {
i915_ttm_move_memcpy(arg);
} else {
/*
* Prevent further use of the object. Any future GTT binding or
* CPU access is not allowed once we signal the fence. Outside
* of the fence critical section, we then also then wedge the gpu
* to indicate the device is not functional.
*
* The below dma_fence_signal() is our write-memory-barrier.
*/
copy_work->obj->mm.unknown_state = true;
}

i915_ttm_move_memcpy(arg);
dma_fence_end_signalling(cookie);

dma_fence_signal(&copy_work->fence);

i915_ttm_memcpy_release(arg);
i915_gem_object_put(copy_work->obj);
dma_fence_put(&copy_work->fence);
}

Expand All @@ -336,6 +378,7 @@ static void __memcpy_irq_work(struct irq_work *irq_work)

dma_fence_signal(&copy_work->fence);
i915_ttm_memcpy_release(arg);
i915_gem_object_put(copy_work->obj);
dma_fence_put(&copy_work->fence);
}

Expand Down Expand Up @@ -389,13 +432,26 @@ i915_ttm_memcpy_work_arm(struct i915_ttm_memcpy_work *work,
return &work->fence;
}

static bool i915_ttm_memcpy_allowed(struct ttm_buffer_object *bo,
struct ttm_resource *dst_mem)
{
if (!(i915_ttm_resource_mappable(bo->resource) &&
i915_ttm_resource_mappable(dst_mem)))
return false;

return I915_SELFTEST_ONLY(ban_memcpy) ? false : true;
}

static struct dma_fence *
__i915_ttm_move(struct ttm_buffer_object *bo,
const struct ttm_operation_ctx *ctx, bool clear,
struct ttm_resource *dst_mem, struct ttm_tt *dst_ttm,
struct i915_refct_sgt *dst_rsgt, bool allow_accel,
const struct i915_deps *move_deps)
{
const bool memcpy_allowed = i915_ttm_memcpy_allowed(bo, dst_mem);
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
struct drm_i915_private *i915 = to_i915(bo->base.dev);
struct i915_ttm_memcpy_work *copy_work = NULL;
struct i915_ttm_memcpy_arg _arg, *arg = &_arg;
struct dma_fence *fence = ERR_PTR(-EINVAL);
Expand Down Expand Up @@ -423,9 +479,14 @@ __i915_ttm_move(struct ttm_buffer_object *bo,
copy_work = kzalloc(sizeof(*copy_work), GFP_KERNEL);

if (copy_work) {
copy_work->i915 = i915;
copy_work->memcpy_allowed = memcpy_allowed;
copy_work->obj = i915_gem_object_get(obj);
arg = &copy_work->arg;
i915_ttm_memcpy_init(arg, bo, clear, dst_mem, dst_ttm,
dst_rsgt);
if (memcpy_allowed)
i915_ttm_memcpy_init(arg, bo, clear, dst_mem,
dst_ttm, dst_rsgt);

fence = i915_ttm_memcpy_work_arm(copy_work, dep);
} else {
dma_fence_wait(dep, false);
Expand All @@ -450,17 +511,23 @@ __i915_ttm_move(struct ttm_buffer_object *bo,
}

/* Error intercept failed or no accelerated migration to start with */
if (!copy_work)
i915_ttm_memcpy_init(arg, bo, clear, dst_mem, dst_ttm,
dst_rsgt);
i915_ttm_move_memcpy(arg);
i915_ttm_memcpy_release(arg);

if (memcpy_allowed) {
if (!copy_work)
i915_ttm_memcpy_init(arg, bo, clear, dst_mem, dst_ttm,
dst_rsgt);
i915_ttm_move_memcpy(arg);
i915_ttm_memcpy_release(arg);
}
if (copy_work)
i915_gem_object_put(copy_work->obj);
kfree(copy_work);

return NULL;
return memcpy_allowed ? NULL : ERR_PTR(-EIO);
out:
if (!fence && copy_work) {
i915_ttm_memcpy_release(arg);
i915_gem_object_put(copy_work->obj);
kfree(copy_work);
}

Expand Down Expand Up @@ -539,8 +606,11 @@ int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
}

if (migration_fence) {
ret = ttm_bo_move_accel_cleanup(bo, migration_fence, evict,
true, dst_mem);
if (I915_SELFTEST_ONLY(evict && fail_gpu_migration))
ret = -EIO; /* never feed non-migrate fences into ttm */
else
ret = ttm_bo_move_accel_cleanup(bo, migration_fence, evict,
true, dst_mem);
if (ret) {
dma_fence_wait(migration_fence, false);
ttm_bo_move_sync_cleanup(bo, dst_mem);
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_ttm_move.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ int i915_ttm_move_notify(struct ttm_buffer_object *bo);

I915_SELFTEST_DECLARE(void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
bool work_allocation));
I915_SELFTEST_DECLARE(void i915_ttm_migrate_set_ban_memcpy(bool ban));

int i915_gem_obj_copy_ttm(struct drm_i915_gem_object *dst,
struct drm_i915_gem_object *src,
Expand Down
Loading

0 comments on commit bfe53be

Please sign in to comment.