Skip to content

Commit

Permalink
Merge tag 'drm-intel-gt-next-2024-06-12' of https://gitlab.freedeskto…
Browse files Browse the repository at this point in the history
…p.org/drm/i915/kernel into drm-next

UAPI Changes:

- Support replaying GPU hangs with captured context image (Tvrtko Ursulin)

Driver Changes:

Fixes/improvements/new stuff:

- Automate CCS Mode setting during engine resets [gt] (Andi Shyti)
- Revert "drm/i915: Remove extra multi-gt pm-references" (Janusz Krzysztofik)
- Fix HAS_REGION() usage in intel_gt_probe_lmem() (Ville Syrjälä)
- Disarm breadcrumbs if engines are already idle [gt] (Chris Wilson)
- Shadow default engine context image in the context (Tvrtko Ursulin)
- Support replaying GPU hangs with captured context image (Tvrtko Ursulin)
- avoid FIELD_PREP warning [guc] (Arnd Bergmann)
- Fix CCS id's calculation for CCS mode setting [gt] (Andi Shyti)
- Increase FLR timeout from 3s to 9s (Andi Shyti)
- Update workaround 14018575942 [mtl] (Angus Chen)

Future platform enablement:

- Enable w/a 16021333562 for DG2, MTL and ARL [guc] (John Harrison)

Miscellaneous:

- Pass the region ID rather than a bitmask to HAS_REGION() (Ville Syrjälä)
- Remove counterproductive REGION_* wrappers (Ville Syrjälä)
- Fix typo [gem/i915_gem_ttm_move] (Deming Wang)
- Delete the live_heartbeat_fast selftest [gt] (Krzysztof Niemiec)

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Tvrtko Ursulin <tursulin@igalia.com>
Link: https://patchwork.freedesktop.org/patch/msgid/Zmmazub+U9ewH9ts@linux
  • Loading branch information
Dave Airlie committed Jun 27, 2024
2 parents 365aa9f + 79655e8 commit a78313b
Show file tree
Hide file tree
Showing 21 changed files with 246 additions and 143 deletions.
17 changes: 17 additions & 0 deletions drivers/gpu/drm/i915/Kconfig.debug
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@ config DRM_I915_WERROR

If in doubt, say "N".

config DRM_I915_REPLAY_GPU_HANGS_API
bool "Enable GPU hang replay userspace API"
depends on DRM_I915
depends on EXPERT
default n
help
Choose this option if you want to enable special and unstable
userspace API used for replaying GPU hangs on a running system.

This API is intended to be used by userspace graphics stack developers
and provides no stability guarantees.

The API needs to be activated at boot time using the
enable_debug_only_api module parameter.

If in doubt, say "N".

config DRM_I915_DEBUG
bool "Enable additional driver debugging"
depends on DRM_I915
Expand Down
113 changes: 113 additions & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
#include "gt/intel_engine_user.h"
#include "gt/intel_gpu_commands.h"
#include "gt/intel_ring.h"
#include "gt/shmem_utils.h"

#include "pxp/intel_pxp.h"

Expand Down Expand Up @@ -957,6 +958,7 @@ static int set_proto_ctx_param(struct drm_i915_file_private *fpriv,
case I915_CONTEXT_PARAM_NO_ZEROMAP:
case I915_CONTEXT_PARAM_BAN_PERIOD:
case I915_CONTEXT_PARAM_RINGSIZE:
case I915_CONTEXT_PARAM_CONTEXT_IMAGE:
default:
ret = -EINVAL;
break;
Expand Down Expand Up @@ -2104,6 +2106,95 @@ static int get_protected(struct i915_gem_context *ctx,
return 0;
}

static int set_context_image(struct i915_gem_context *ctx,
struct drm_i915_gem_context_param *args)
{
struct i915_gem_context_param_context_image user;
struct intel_context *ce;
struct file *shmem_state;
unsigned long lookup;
void *state;
int ret = 0;

if (!IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API))
return -EINVAL;

if (!ctx->i915->params.enable_debug_only_api)
return -EINVAL;

if (args->size < sizeof(user))
return -EINVAL;

if (copy_from_user(&user, u64_to_user_ptr(args->value), sizeof(user)))
return -EFAULT;

if (user.mbz)
return -EINVAL;

if (user.flags & ~(I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX))
return -EINVAL;

lookup = 0;
if (user.flags & I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX)
lookup |= LOOKUP_USER_INDEX;

ce = lookup_user_engine(ctx, lookup, &user.engine);
if (IS_ERR(ce))
return PTR_ERR(ce);

if (user.size < ce->engine->context_size) {
ret = -EINVAL;
goto out_ce;
}

if (drm_WARN_ON_ONCE(&ctx->i915->drm,
test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) {
/*
* This is racy but for a debug only API, if userspace is keen
* to create and configure contexts, while simultaneously using
* them from a second thread, let them suffer by potentially not
* executing with the context image they just raced to apply.
*/
ret = -EBUSY;
goto out_ce;
}

state = kmalloc(ce->engine->context_size, GFP_KERNEL);
if (!state) {
ret = -ENOMEM;
goto out_ce;
}

if (copy_from_user(state, u64_to_user_ptr(user.image),
ce->engine->context_size)) {
ret = -EFAULT;
goto out_state;
}

shmem_state = shmem_create_from_data(ce->engine->name,
state, ce->engine->context_size);
if (IS_ERR(shmem_state)) {
ret = PTR_ERR(shmem_state);
goto out_state;
}

if (intel_context_set_own_state(ce)) {
ret = -EBUSY;
fput(shmem_state);
goto out_state;
}

ce->default_state = shmem_state;

args->size = sizeof(user);

out_state:
kfree(state);
out_ce:
intel_context_put(ce);
return ret;
}

static int ctx_setparam(struct drm_i915_file_private *fpriv,
struct i915_gem_context *ctx,
struct drm_i915_gem_context_param *args)
Expand Down Expand Up @@ -2156,6 +2247,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
ret = set_persistence(ctx, args);
break;

case I915_CONTEXT_PARAM_CONTEXT_IMAGE:
ret = set_context_image(ctx, args);
break;

case I915_CONTEXT_PARAM_PROTECTED_CONTENT:
case I915_CONTEXT_PARAM_NO_ZEROMAP:
case I915_CONTEXT_PARAM_BAN_PERIOD:
Expand Down Expand Up @@ -2500,6 +2595,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
case I915_CONTEXT_PARAM_BAN_PERIOD:
case I915_CONTEXT_PARAM_ENGINES:
case I915_CONTEXT_PARAM_RINGSIZE:
case I915_CONTEXT_PARAM_CONTEXT_IMAGE:
default:
ret = -EINVAL;
break;
Expand Down Expand Up @@ -2612,5 +2708,22 @@ int __init i915_gem_context_module_init(void)
if (!slab_luts)
return -ENOMEM;

if (IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API)) {
pr_notice("**************************************************************\n");
pr_notice("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_notice("** **\n");
if (i915_modparams.enable_debug_only_api)
pr_notice("** i915.enable_debug_only_api is intended to be set **\n");
else
pr_notice("** CONFIG_DRM_I915_REPLAY_GPU_HANGS_API builds are intended **\n");
pr_notice("** for specific userspace graphics stack developers only! **\n");
pr_notice("** **\n");
pr_notice("** If you are seeing this message please report this to the **\n");
pr_notice("** provider of your kernel build. **\n");
pr_notice("** **\n");
pr_notice("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_notice("**************************************************************\n");
}

return 0;
}
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
* @bo: The ttm buffer object.
*
* This function prepares an object for move by removing all GPU bindings,
* removing all CPU mapings and finally releasing the pages sg-table.
* removing all CPU mappings and finally releasing the pages sg-table.
*
* Return: 0 if successful, negative error code on error.
*/
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ static void rcu_context_free(struct rcu_head *rcu)
struct intel_context *ce = container_of(rcu, typeof(*ce), rcu);

trace_intel_context_free(ce);
if (intel_context_has_own_state(ce))
fput(ce->default_state);
kmem_cache_free(slab_ce, ce);
}

Expand Down
22 changes: 22 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,28 @@ intel_context_clear_nopreempt(struct intel_context *ce)
clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
}

#if IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API)
static inline bool intel_context_has_own_state(const struct intel_context *ce)
{
return test_bit(CONTEXT_OWN_STATE, &ce->flags);
}

static inline bool intel_context_set_own_state(struct intel_context *ce)
{
return test_and_set_bit(CONTEXT_OWN_STATE, &ce->flags);
}
#else
static inline bool intel_context_has_own_state(const struct intel_context *ce)
{
return false;
}

static inline bool intel_context_set_own_state(struct intel_context *ce)
{
return true;
}
#endif

u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);

Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_context_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ struct intel_context {
struct i915_address_space *vm;
struct i915_gem_context __rcu *gem_context;

struct file *default_state;

/*
* @signal_lock protects the list of requests that need signaling,
* @signals. While there are any requests that need signaling,
Expand Down Expand Up @@ -131,6 +133,7 @@ struct intel_context {
#define CONTEXT_IS_PARKING 12
#define CONTEXT_EXITING 13
#define CONTEXT_LOW_LATENCY 14
#define CONTEXT_OWN_STATE 15

struct {
u64 timeout_us;
Expand Down
8 changes: 5 additions & 3 deletions drivers/gpu/drm/i915/gt/intel_lrc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1017,9 +1017,8 @@ void lrc_init_state(struct intel_context *ce,

set_redzone(state, engine);

if (engine->default_state) {
shmem_read(engine->default_state, 0,
state, engine->context_size);
if (ce->default_state) {
shmem_read(ce->default_state, 0, state, engine->context_size);
__set_bit(CONTEXT_VALID_BIT, &ce->flags);
inhibit = false;
}
Expand Down Expand Up @@ -1131,6 +1130,9 @@ int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)

GEM_BUG_ON(ce->state);

if (!intel_context_has_own_state(ce))
ce->default_state = engine->default_state;

vma = __lrc_alloc_state(ce, engine);
if (IS_ERR(vma))
return PTR_ERR(vma);
Expand Down
8 changes: 5 additions & 3 deletions drivers/gpu/drm/i915/gt/intel_ring_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,7 @@ static int ring_context_init_default_state(struct intel_context *ce,
if (IS_ERR(vaddr))
return PTR_ERR(vaddr);

shmem_read(ce->engine->default_state, 0,
vaddr, ce->engine->context_size);
shmem_read(ce->default_state, 0, vaddr, ce->engine->context_size);

i915_gem_object_flush_map(obj);
__i915_gem_object_release_map(obj);
Expand All @@ -491,7 +490,7 @@ static int ring_context_pre_pin(struct intel_context *ce,
struct i915_address_space *vm;
int err = 0;

if (ce->engine->default_state &&
if (ce->default_state &&
!test_bit(CONTEXT_VALID_BIT, &ce->flags)) {
err = ring_context_init_default_state(ce, ww);
if (err)
Expand Down Expand Up @@ -570,6 +569,9 @@ static int ring_context_alloc(struct intel_context *ce)
{
struct intel_engine_cs *engine = ce->engine;

if (!intel_context_has_own_state(ce))
ce->default_state = engine->default_state;

/* One ringbuffer to rule them all */
GEM_BUG_ON(!engine->legacy.ring);
ce->ring = engine->legacy.ring;
Expand Down
8 changes: 8 additions & 0 deletions drivers/gpu/drm/i915/gt/intel_workarounds.c
Original file line number Diff line number Diff line change
Expand Up @@ -1590,6 +1590,14 @@ xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
*/
wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);

/*
* Wa_14018575942
*
* Issue is seen on media KPI test running on VDBOX engine
* especially VP9 encoding WLs
*/
wa_write_or(wal, XELPMP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);

/* Wa_22016670082 */
wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);

Expand Down
Loading

0 comments on commit a78313b

Please sign in to comment.