drm/i915: use pat_index instead of cache_level
Currently the KMD uses enum i915_cache_level to set the caching policy for
buffer objects. This is fragile because the PAT index, which actually
controls the caching behavior in the PTE, has far more levels than the enum
defines. In addition, the PAT index is platform dependent, so translating
between i915_cache_level and a PAT index is unreliable and complicates the
code.

From the UMD's perspective there is also a need to set the caching policy
for performance fine-tuning. It is much easier for the UMD to use the PAT
index directly, because the behavior of each PAT index is clearly defined
in the Bspec. Having the abstracted i915_cache_level sitting in between
only adds ambiguity. PAT is expected to work much like MOCS already works
today, and by design userspace is expected to select the index that exactly
matches the desired behavior described in the hardware specification.

For these reasons this patch replaces i915_cache_level with the PAT index.
Note that cache_level is not completely removed yet, because the KMD still
needs to create buffer objects with simple cache settings such as cached,
uncached, or write-through. For kernel objects, cache_level is kept for
simplicity and backward compatibility. On pre-gen12 platforms PAT has a 1:1
mapping to i915_cache_level, so the two are interchangeable; see the use of
LEGACY_CACHELEVEL, sketched below.
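
To make the platform dependence concrete, here is a sketch of what the
per-platform translation tables look like. The values are modeled on the
LEGACY_CACHELEVEL and TGL_CACHELEVEL macros added in i915/i915_pci.c;
treat the exact indices as illustrative and check that file and the Bspec
for the authoritative definitions:

    /* Pre-gen12: the PAT index is a 1:1 match with enum i915_cache_level. */
    #define LEGACY_CACHELEVEL \
            .cachelevel_to_pat = { \
                    [I915_CACHE_NONE]   = 0, \
                    [I915_CACHE_LLC]    = 1, \
                    [I915_CACHE_L3_LLC] = 2, \
                    [I915_CACHE_WT]     = 3, \
            }

    /* Gen12: the same cache levels land on entirely different indices. */
    #define TGL_CACHELEVEL \
            .cachelevel_to_pat = { \
                    [I915_CACHE_NONE]   = 3, \
                    [I915_CACHE_LLC]    = 0, \
                    [I915_CACHE_L3_LLC] = 0, \
                    [I915_CACHE_WT]     = 2, \
            }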

One consequence of this change is that gen8_pte_encode no longer works for
gen12 platforms, because gen12 platforms have different PAT definitions.
Meanwhile the mtl_pte_encode function, introduced specifically for MTL,
becomes generic for all gen12 platforms. This patch therefore renames the
MTL PTE encode function to gen12_pte_encode and applies it to all gen12
platforms. Even though this change looks unrelated, separating it would
temporarily break gen12 PTE encoding, so the two are squashed into one
patch.
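
For reference, a simplified sketch of the resulting encoder: unlike the
cache_level-based gen8 path, it scatters the PAT index bits directly into
dedicated PTE bits (abridged from the gen12_pte_encode in this series;
bit names follow the i915 GTT headers and are assumptions to verify there):

    static u64 gen12_pte_encode(dma_addr_t addr,
                                unsigned int pat_index,
                                u32 flags)
    {
            gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;

            if (unlikely(flags & PTE_READ_ONLY))
                    pte &= ~GEN8_PAGE_RW;

            if (flags & PTE_LM)
                    pte |= GEN12_PPGTT_PTE_LM;

            /* Each bit of the PAT index maps to a dedicated PTE bit. */
            if (pat_index & BIT(0))
                    pte |= GEN12_PPGTT_PTE_PAT0;
            if (pat_index & BIT(1))
                    pte |= GEN12_PPGTT_PTE_PAT1;
            if (pat_index & BIT(2))
                    pte |= GEN12_PPGTT_PTE_PAT2;
            if (pat_index & BIT(3))
                    pte |= MTL_PPGTT_PTE_PAT3;

            return pte;
    }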

Special note: this patch changes the way caching behavior is controlled,
in the sense that some objects are left to be managed by userspace. For
such objects we need to be careful not to change the userspace settings.
Kerneldoc and comments have been added around obj->cache_coherent and
cache_dirty, and on how to bypass those checks via
i915_gem_object_has_cache_level(). For a full understanding, these changes
need to be read together with the two follow-up patches: one disables the
{set|get}_caching ioctls and the other adds a set_pat extension to the
GEM_CREATE uAPI.
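
As a sketch of where this is headed, the set_pat extension from that
follow-up is expected to let userspace pick a PAT index at object creation,
roughly as follows (the struct and extension name come from the follow-up
uAPI patch, not this one, and the index value is platform specific):

    struct drm_i915_gem_create_ext_set_pat set_pat = {
            .base = { .name = I915_GEM_CREATE_EXT_SET_PAT },
            .pat_index = 3, /* must match a behavior defined in the Bspec */
    };
    struct drm_i915_gem_create_ext create = {
            .size = 4096,
            .extensions = (uintptr_t)&set_pat,
    };

    ioctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);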

Bspec: 63019

Cc: Chris Wilson <chris.p.wilson@linux.intel.com>
Signed-off-by: Fei Yang <fei.yang@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230509165200.1740-3-fei.yang@intel.com
Fei Yang authored and Andi Shyti committed May 11, 2023
1 parent 5e352e3 commit 9275277
Showing 36 changed files with 453 additions and 238 deletions.
12 changes: 6 additions & 6 deletions drivers/gpu/drm/i915/display/intel_dpt.c
@@ -43,24 +43,24 @@ static void gen8_set_pte(void __iomem *addr, gen8_pte_t pte)
 static void dpt_insert_page(struct i915_address_space *vm,
                             dma_addr_t addr,
                             u64 offset,
-                            enum i915_cache_level level,
+                            unsigned int pat_index,
                             u32 flags)
 {
         struct i915_dpt *dpt = i915_vm_to_dpt(vm);
         gen8_pte_t __iomem *base = dpt->iomem;

         gen8_set_pte(base + offset / I915_GTT_PAGE_SIZE,
-                     vm->pte_encode(addr, level, flags));
+                     vm->pte_encode(addr, pat_index, flags));
 }

 static void dpt_insert_entries(struct i915_address_space *vm,
                                struct i915_vma_resource *vma_res,
-                               enum i915_cache_level level,
+                               unsigned int pat_index,
                                u32 flags)
 {
         struct i915_dpt *dpt = i915_vm_to_dpt(vm);
         gen8_pte_t __iomem *base = dpt->iomem;
-        const gen8_pte_t pte_encode = vm->pte_encode(0, level, flags);
+        const gen8_pte_t pte_encode = vm->pte_encode(0, pat_index, flags);
         struct sgt_iter sgt_iter;
         dma_addr_t addr;
         int i;
@@ -83,7 +83,7 @@ static void dpt_clear_range(struct i915_address_space *vm,
 static void dpt_bind_vma(struct i915_address_space *vm,
                          struct i915_vm_pt_stash *stash,
                          struct i915_vma_resource *vma_res,
-                         enum i915_cache_level cache_level,
+                         unsigned int pat_index,
                          u32 flags)
 {
         u32 pte_flags;
@@ -98,7 +98,7 @@ static void dpt_bind_vma(struct i915_address_space *vm,
         if (vma_res->bi.lmem)
                 pte_flags |= PTE_LM;

-        vm->insert_entries(vm, vma_res, cache_level, pte_flags);
+        vm->insert_entries(vm, vma_res, pat_index, pte_flags);

         vma_res->page_sizes_gtt = I915_GTT_PAGE_SIZE;
58 changes: 40 additions & 18 deletions drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -27,8 +27,15 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
         if (IS_DGFX(i915))
                 return false;

-        return !(obj->cache_level == I915_CACHE_NONE ||
-                 obj->cache_level == I915_CACHE_WT);
+        /*
+         * For objects created by userspace through GEM_CREATE with pat_index
+         * set by the set_pat extension, i915_gem_object_has_cache_level()
+         * always returns true, because the coherency of such objects is
+         * managed by userspace. Otherwise the call here falls back to
+         * checking whether the object is un-cached or write-through.
+         */
+        return !(i915_gem_object_has_cache_level(obj, I915_CACHE_NONE) ||
+                 i915_gem_object_has_cache_level(obj, I915_CACHE_WT));
 }

 bool i915_gem_cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
@@ -267,7 +274,13 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 {
         int ret;

-        if (obj->cache_level == cache_level)
+        /*
+         * For objects created by userspace through GEM_CREATE with pat_index
+         * set by the set_pat extension, simply return 0 here without touching
+         * the cache setting, because such objects have an immutable cache
+         * setting by design and are always managed by userspace.
+         */
+        if (i915_gem_object_has_cache_level(obj, cache_level))
                 return 0;

         ret = i915_gem_object_wait(obj,
@@ -278,10 +291,8 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                 return ret;

         /* Always invalidate stale cachelines */
-        if (obj->cache_level != cache_level) {
-                i915_gem_object_set_cache_coherency(obj, cache_level);
-                obj->cache_dirty = true;
-        }
+        i915_gem_object_set_cache_coherency(obj, cache_level);
+        obj->cache_dirty = true;

         /* The cache-level will be applied when each vma is rebound. */
         return i915_gem_object_unbind(obj,
@@ -306,20 +317,22 @@ int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
                 goto out;
         }

-        switch (obj->cache_level) {
-        case I915_CACHE_LLC:
-        case I915_CACHE_L3_LLC:
-                args->caching = I915_CACHING_CACHED;
-                break;
+        /*
+         * This ioctl should be disabled for objects with pat_index set by
+         * user space.
+         */
+        if (obj->pat_set_by_user) {
+                err = -EOPNOTSUPP;
+                goto out;
+        }

-        case I915_CACHE_WT:
+        if (i915_gem_object_has_cache_level(obj, I915_CACHE_LLC) ||
+            i915_gem_object_has_cache_level(obj, I915_CACHE_L3_LLC))
+                args->caching = I915_CACHING_CACHED;
+        else if (i915_gem_object_has_cache_level(obj, I915_CACHE_WT))
                 args->caching = I915_CACHING_DISPLAY;
-                break;
-
-        default:
+        else
                 args->caching = I915_CACHING_NONE;
-                break;
-        }
 out:
         rcu_read_unlock();
         return err;
@@ -364,6 +377,15 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
         if (!obj)
                 return -ENOENT;

+        /*
+         * This ioctl should be disabled for objects with pat_index set by
+         * user space.
+         */
+        if (obj->pat_set_by_user) {
+                ret = -EOPNOTSUPP;
+                goto out;
+        }
+
         /*
          * The caching mode of proxy object is handled by its generator, and
          * not allowed to be changed by userspace.
15 changes: 12 additions & 3 deletions drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -640,9 +640,15 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
         if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
                 return false;

+        /*
+         * For objects created by userspace through GEM_CREATE with pat_index
+         * set by the set_pat extension, i915_gem_object_has_cache_level()
+         * always returns true, otherwise the call falls back to checking
+         * whether the object is un-cached.
+         */
         return (cache->has_llc ||
                 obj->cache_dirty ||
-                obj->cache_level != I915_CACHE_NONE);
+                !i915_gem_object_has_cache_level(obj, I915_CACHE_NONE));
 }

 static int eb_reserve_vma(struct i915_execbuffer *eb,
@@ -1324,7 +1330,10 @@ static void *reloc_iomap(struct i915_vma *batch,
         if (drm_mm_node_allocated(&cache->node)) {
                 ggtt->vm.insert_page(&ggtt->vm,
                                      i915_gem_object_get_dma_address(obj, page),
-                                     offset, I915_CACHE_NONE, 0);
+                                     offset,
+                                     i915_gem_get_pat_index(ggtt->vm.i915,
+                                                            I915_CACHE_NONE),
+                                     0);
         } else {
                 offset += page << PAGE_SHIFT;
         }
@@ -1464,7 +1473,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
         reloc_cache_unmap(&eb->reloc_cache);
         mutex_lock(&vma->vm->mutex);
         err = i915_vma_bind(target->vma,
-                            target->vma->obj->cache_level,
+                            target->vma->obj->pat_index,
                             PIN_GLOBAL, NULL, NULL);
         mutex_unlock(&vma->vm->mutex);
         reloc_cache_remap(&eb->reloc_cache, ev->vma->obj);
11 changes: 10 additions & 1 deletion drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -383,7 +383,16 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
         }

         /* Access to snoopable pages through the GTT is incoherent. */
-        if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(i915)) {
+        /*
+         * For objects created by userspace through GEM_CREATE with pat_index
+         * set by the set_pat extension, coherency is managed by userspace;
+         * make sure we don't fail handling the vm fault by calling
+         * i915_gem_object_has_cache_level(), which always returns true for
+         * such objects. Otherwise this check falls back to whether the
+         * object is un-cached.
+         */
+        if (!(i915_gem_object_has_cache_level(obj, I915_CACHE_NONE) ||
+              HAS_LLC(i915))) {
                 ret = -EFAULT;
                 goto err_unpin;
         }
51 changes: 50 additions & 1 deletion drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -54,6 +54,24 @@ unsigned int i915_gem_get_pat_index(struct drm_i915_private *i915,
         return INTEL_INFO(i915)->cachelevel_to_pat[level];
 }

+bool i915_gem_object_has_cache_level(const struct drm_i915_gem_object *obj,
+                                     enum i915_cache_level lvl)
+{
+        /*
+         * In case the pat_index is set by user space, this kernel mode
+         * driver should leave the coherency to be managed by user space;
+         * simply return true here.
+         */
+        if (obj->pat_set_by_user)
+                return true;
+
+        /*
+         * Otherwise the pat_index should have been converted from cache_level
+         * so that the following comparison is valid.
+         */
+        return obj->pat_index == i915_gem_get_pat_index(obj_to_i915(obj), lvl);
+}
+
 struct drm_i915_gem_object *i915_gem_object_alloc(void)
 {
         struct drm_i915_gem_object *obj;
@@ -133,7 +151,7 @@ void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj,
 {
         struct drm_i915_private *i915 = to_i915(obj->base.dev);

-        obj->cache_level = cache_level;
+        obj->pat_index = i915_gem_get_pat_index(i915, cache_level);

         if (cache_level != I915_CACHE_NONE)
                 obj->cache_coherent = (I915_BO_CACHE_COHERENT_FOR_READ |
@@ -148,6 +166,37 @@ void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj,
                 !IS_DGFX(i915);
 }

+/**
+ * i915_gem_object_set_pat_index - set PAT index to be used in PTE encode
+ * @obj: #drm_i915_gem_object
+ * @pat_index: PAT index
+ *
+ * This is a clone of i915_gem_object_set_cache_coherency taking pat_index
+ * instead of cache_level as its second argument.
+ */
+void i915_gem_object_set_pat_index(struct drm_i915_gem_object *obj,
+                                   unsigned int pat_index)
+{
+        struct drm_i915_private *i915 = to_i915(obj->base.dev);
+
+        if (obj->pat_index == pat_index)
+                return;
+
+        obj->pat_index = pat_index;
+
+        if (pat_index != i915_gem_get_pat_index(i915, I915_CACHE_NONE))
+                obj->cache_coherent = (I915_BO_CACHE_COHERENT_FOR_READ |
+                                       I915_BO_CACHE_COHERENT_FOR_WRITE);
+        else if (HAS_LLC(i915))
+                obj->cache_coherent = I915_BO_CACHE_COHERENT_FOR_READ;
+        else
+                obj->cache_coherent = 0;
+
+        obj->cache_dirty =
+                !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE) &&
+                !IS_DGFX(i915);
+}
+
 bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj)
 {
         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4 changes: 4 additions & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -34,6 +34,8 @@ static inline bool i915_gem_object_size_2big(u64 size)

 unsigned int i915_gem_get_pat_index(struct drm_i915_private *i915,
                                     enum i915_cache_level level);
+bool i915_gem_object_has_cache_level(const struct drm_i915_gem_object *obj,
+                                     enum i915_cache_level lvl);
 void i915_gem_init__objects(struct drm_i915_private *i915);

 void i915_objects_module_exit(void);
@@ -764,6 +766,8 @@ bool i915_gem_object_has_unknown_state(struct drm_i915_gem_object *obj);

 void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj,
                                          unsigned int cache_level);
+void i915_gem_object_set_pat_index(struct drm_i915_gem_object *obj,
+                                   unsigned int pat_index);
 bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj);
 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj);
 void i915_gem_object_flush_if_display_locked(struct drm_i915_gem_object *obj);
46 changes: 42 additions & 4 deletions drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -364,15 +364,43 @@ struct drm_i915_gem_object {
 #define I915_BO_FLAG_STRUCT_PAGE BIT(0) /* Object backed by struct pages */
 #define I915_BO_FLAG_IOMEM       BIT(1) /* Object backed by IO memory */
         /**
-         * @cache_level: The desired GTT caching level.
+         * @pat_index: The desired PAT index.
+         *
+         * See the hardware specification for valid PAT indices on each
+         * platform. This field replaces the @cache_level that contained a
+         * value of enum i915_cache_level, since PAT indices are used by both
+         * userspace and the kernel mode driver for caching-policy control
+         * after GEN12. Platform-specific tables translate i915_cache_level
+         * into a PAT index; for details check the macros defined in
+         * i915/i915_pci.c, e.g. PVC_CACHELEVEL.
+         * For backward compatibility, this field contains values that
+         * exactly match the entries of enum i915_cache_level for pre-GEN12
+         * platforms (see LEGACY_CACHELEVEL), so that the PTE encode
+         * functions for these legacy platforms can stay the same.
+         */
+        unsigned int pat_index:6;
+        /**
+         * @pat_set_by_user: Indicate whether pat_index is set by user space
          *
-         * See enum i915_cache_level for possible values, along with what
-         * each does.
+         * This field is false by default and only set to true if the
+         * pat_index is set by user space. By design, user space is capable
+         * of managing caching behavior by setting pat_index, in which case
+         * the kernel mode driver should never touch the pat_index.
         */
-        unsigned int cache_level:3;
+        unsigned int pat_set_by_user:1;
         /**
          * @cache_coherent:
          *
+         * Note: with the change above which replaced @cache_level with
+         * @pat_index, the use of @cache_coherent is limited to objects
+         * created by the kernel or by userspace without a PAT index
+         * specified. Check @pat_set_by_user to find out if an object has
+         * its PAT index set by userspace. The ioctls to change cache
+         * settings have also been disabled for objects with a PAT index set
+         * by userspace, so don't assume @cache_coherent has the flags set
+         * as described here. The helper i915_gem_object_has_cache_level()
+         * provides one way to bypass the use of this field.
+         *
          * Track whether the pages are coherent with the GPU if reading or
          * writing through the CPU caches. This largely depends on the
          * @cache_level setting.
@@ -446,6 +474,16 @@ struct drm_i915_gem_object {
         /**
          * @cache_dirty:
          *
+         * Note: with the change above which replaced cache_level with
+         * @pat_index, the use of @cache_dirty is limited to objects created
+         * by the kernel or by userspace without a PAT index specified.
+         * Check @pat_set_by_user to find out if an object has its PAT index
+         * set by userspace. The ioctls to change cache settings have also
+         * been disabled for objects with @pat_index set by userspace, so
+         * don't assume @cache_dirty is set as described here. Also see the
+         * helper i915_gem_object_has_cache_level() for possible ways to
+         * bypass the use of this field.
+         *
          * Track if we are dirty with writes through the CPU cache for this
          * object. As a result, reading directly from main memory might yield
          * stale data.
4 changes: 3 additions & 1 deletion drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -565,7 +565,9 @@ static void dbg_poison(struct i915_ggtt *ggtt,

                 ggtt->vm.insert_page(&ggtt->vm, addr,
                                      ggtt->error_capture.start,
-                                     I915_CACHE_NONE, 0);
+                                     i915_gem_get_pat_index(ggtt->vm.i915,
+                                                            I915_CACHE_NONE),
+                                     0);
                 mb();

                 s = io_mapping_map_wc(&ggtt->iomap,
8 changes: 5 additions & 3 deletions drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -214,7 +214,8 @@ static struct dma_fence *i915_ttm_accel_move(struct ttm_buffer_object *bo,

                 intel_engine_pm_get(to_gt(i915)->migrate.context->engine);
                 ret = intel_context_migrate_clear(to_gt(i915)->migrate.context, deps,
-                                                  dst_st->sgl, dst_level,
+                                                  dst_st->sgl,
+                                                  i915_gem_get_pat_index(i915, dst_level),
                                                   i915_ttm_gtt_binds_lmem(dst_mem),
                                                   0, &rq);
         } else {
@@ -228,9 +229,10 @@ static struct dma_fence *i915_ttm_accel_move(struct ttm_buffer_object *bo,
                 intel_engine_pm_get(to_gt(i915)->migrate.context->engine);
                 ret = intel_context_migrate_copy(to_gt(i915)->migrate.context,
                                                  deps, src_rsgt->table.sgl,
-                                                 src_level,
+                                                 i915_gem_get_pat_index(i915, src_level),
                                                  i915_ttm_gtt_binds_lmem(bo->resource),
-                                                 dst_st->sgl, dst_level,
+                                                 dst_st->sgl,
+                                                 i915_gem_get_pat_index(i915, dst_level),
                                                  i915_ttm_gtt_binds_lmem(dst_mem),
                                                  &rq);

2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/gem/selftests/huge_pages.c
@@ -354,7 +354,7 @@ fake_huge_pages_object(struct drm_i915_private *i915, u64 size, bool single)

         obj->write_domain = I915_GEM_DOMAIN_CPU;
         obj->read_domains = I915_GEM_DOMAIN_CPU;
-        obj->cache_level = I915_CACHE_NONE;
+        obj->pat_index = i915_gem_get_pat_index(i915, I915_CACHE_NONE);

         return obj;
 }
[diff truncated: remaining changed files not shown]
