drm/i915: Preallocate stashes for vma page-directories
The DMA allocations used for page directories need to be performed up
front so that we can include those allocations in our memory
reservation pass. The downside is that we have to assume the worst
case, even before we know the final layout, and always allocate enough
page directories for this object, even when there will be overlap.
This can unfortunately be quite expensive, especially as we have to
clear/reset the page directories and DMA pages, but it should only be
required during the early phases of a workload, when new objects are
being discovered, or after memory/eviction pressure, when we need to
rebind. Once we reach steady state, the objects should not be moved
and we no longer need to preallocate the page tables.
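
In outline, the scheme is: build a singly-linked stash of page tables
before taking any locks, pop from it inside the critical section where
failure is no longer an option, and hand back whatever was not
consumed. A minimal userspace sketch of that pattern (illustrative
names only, not the actual i915 structures):

#include <stdio.h>
#include <stdlib.h>

struct pt {			/* stands in for struct i915_page_table */
	struct pt *stash;	/* link used only while parked in the stash */
	int used;
};

struct pt_stash {		/* stands in for struct i915_vm_pt_stash */
	struct pt *pt;
};

/* Allocate the worst-case number of nodes while failure can still be reported. */
static int stash_prealloc(struct pt_stash *stash, unsigned int count)
{
	while (count--) {
		struct pt *pt = calloc(1, sizeof(*pt));

		if (!pt)
			return -1;
		pt->stash = stash->pt;	/* push onto the singly-linked stash */
		stash->pt = pt;
	}
	return 0;
}

/* Pop a preallocated node; inside the walk this can no longer fail. */
static struct pt *stash_pop(struct pt_stash *stash)
{
	struct pt *pt = stash->pt;

	stash->pt = pt->stash;
	pt->used = 0;
	return pt;
}

/* Return any nodes the walk did not consume. */
static void stash_free(struct pt_stash *stash)
{
	struct pt *pt;

	while ((pt = stash->pt)) {
		stash->pt = pt->stash;
		free(pt);
	}
}

int main(void)
{
	struct pt_stash stash = { 0 };
	struct pt *pt;

	if (stash_prealloc(&stash, 8))	/* worst case for the range being bound */
		return 1;

	pt = stash_pop(&stash);		/* consume one node during the "bind" */
	printf("popped %p (used=%d)\n", (void *)pt, pt->used);
	free(pt);

	stash_free(&stash);		/* leftovers go back to the allocator */
	return 0;
}

In the patch itself the stash is struct i915_vm_pt_stash, filled by
i915_vm_alloc_pt_stash() and drained by i915_vm_free_pt_stash(), with
nodes popped via pt->stash under the page-directory spinlock.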

It should be noted that the lifetime of the page-directory DMA is more
or less decoupled from individual fences, as the directories will be
shared across objects and timelines.

v2: Only allocate enough PD space for the PTEs we may use; we do not
need to allocate PDs that will be left as scratch.
v3: Store the shift into the first PD level to encapsulate the
different PTE counts for gen6/gen8.
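
Worked example of the v3 shift (values taken from the gen6/gen8 hunks
below): pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(pte)). With 4-byte gen6
PTEs that is ilog2(4096 * 4096 / 4) = 22, i.e. one page directory
covers 1024 PTEs * 4 KiB = 4 MiB of address space; with 8-byte gen8
PTEs it is 21, i.e. 512 PTEs * 4 KiB = 2 MiB.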

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200729164219.5737-1-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Chris Wilson authored and Joonas Lahtinen committed Sep 7, 2020
1 parent b3786b2 commit cd0452a
Showing 9 changed files with 237 additions and 190 deletions.
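
The caller-facing flow these files introduce, condensed from the
init_aliasing_ppgtt() hunk in intel_ggtt.c below (error unwinding and
locking omitted; "size" stands in for ggtt->vm.total):

	struct i915_vm_pt_stash stash = {};
	int err;

	/* 1. Reserve worst-case page-directory memory while we may still fail. */
	err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, size);
	if (err)
		return err;

	/* 2. allocate_va_range() now returns void and only consumes the stash. */
	ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, size);

	/* 3. Hand back any page tables that were not needed. */
	i915_vm_free_pt_stash(&ppgtt->vm, &stash);
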
11 changes: 6 additions & 5 deletions drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
@@ -32,12 +32,13 @@ static void vma_clear_pages(struct i915_vma *vma)
vma->pages = NULL;
}

static int vma_bind(struct i915_address_space *vm,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 flags)
static void vma_bind(struct i915_address_space *vm,
struct i915_vm_pt_stash *stash,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 flags)
{
return vm->vma_ops.bind_vma(vm, vma, cache_level, flags);
vm->vma_ops.bind_vma(vm, stash, vma, cache_level, flags);
}

static void vma_unbind(struct i915_address_space *vm, struct i915_vma *vma)
40 changes: 14 additions & 26 deletions drivers/gpu/drm/i915/gt/gen6_ppgtt.c
@@ -177,16 +177,16 @@ static void gen6_flush_pd(struct gen6_ppgtt *ppgtt, u64 start, u64 end)
mutex_unlock(&ppgtt->flush);
}

static int gen6_alloc_va_range(struct i915_address_space *vm,
u64 start, u64 length)
static void gen6_alloc_va_range(struct i915_address_space *vm,
struct i915_vm_pt_stash *stash,
u64 start, u64 length)
{
struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm));
struct i915_page_directory * const pd = ppgtt->base.pd;
struct i915_page_table *pt, *alloc = NULL;
struct i915_page_table *pt;
bool flush = false;
u64 from = start;
unsigned int pde;
int ret = 0;

spin_lock(&pd->lock);
gen6_for_each_pde(pt, pd, start, length, pde) {
@@ -195,21 +195,17 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
if (px_base(pt) == px_base(&vm->scratch[1])) {
spin_unlock(&pd->lock);

pt = fetch_and_zero(&alloc);
if (!pt)
pt = alloc_pt(vm);
if (IS_ERR(pt)) {
ret = PTR_ERR(pt);
goto unwind_out;
}
pt = stash->pt[0];
GEM_BUG_ON(!pt);

fill32_px(pt, vm->scratch[0].encode);

spin_lock(&pd->lock);
if (pd->entry[pde] == &vm->scratch[1]) {
stash->pt[0] = pt->stash;
atomic_set(&pt->used, 0);
pd->entry[pde] = pt;
} else {
alloc = pt;
pt = pd->entry[pde];
}

@@ -226,15 +222,6 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
with_intel_runtime_pm(&vm->i915->runtime_pm, wakeref)
gen6_flush_pd(ppgtt, from, start);
}

goto out;

unwind_out:
gen6_ppgtt_clear_range(vm, from, start - from);
out:
if (alloc)
free_px(vm, alloc);
return ret;
}

static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt)
@@ -302,10 +289,11 @@ static void pd_vma_clear_pages(struct i915_vma *vma)
vma->pages = NULL;
}

static int pd_vma_bind(struct i915_address_space *vm,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 unused)
static void pd_vma_bind(struct i915_address_space *vm,
struct i915_vm_pt_stash *stash,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 unused)
{
struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
struct gen6_ppgtt *ppgtt = vma->private;
@@ -315,7 +303,6 @@ static int pd_vma_bind(struct i915_address_space *vm,
ppgtt->pd_addr = (gen6_pte_t __iomem *)ggtt->gsm + ggtt_offset;

gen6_flush_pd(ppgtt, 0, ppgtt->base.vm.total);
return 0;
}

static void pd_vma_unbind(struct i915_address_space *vm, struct i915_vma *vma)
@@ -448,6 +435,7 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt)
mutex_init(&ppgtt->pin_mutex);

ppgtt_init(&ppgtt->base, gt);
ppgtt->base.vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen6_pte_t));
ppgtt->base.vm.top = 1;

ppgtt->base.vm.bind_async_flags = I915_VMA_LOCAL_BIND;
78 changes: 23 additions & 55 deletions drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -269,14 +269,12 @@ static void gen8_ppgtt_clear(struct i915_address_space *vm,
start, start + length, vm->top);
}

static int __gen8_ppgtt_alloc(struct i915_address_space * const vm,
struct i915_page_directory * const pd,
u64 * const start, const u64 end, int lvl)
static void __gen8_ppgtt_alloc(struct i915_address_space * const vm,
struct i915_vm_pt_stash *stash,
struct i915_page_directory * const pd,
u64 * const start, const u64 end, int lvl)
{
const struct i915_page_scratch * const scratch = &vm->scratch[lvl];
struct i915_page_table *alloc = NULL;
unsigned int idx, len;
int ret = 0;

GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT);

@@ -297,49 +295,30 @@ static int __gen8_ppgtt_alloc(struct i915_address_space * const vm,
DBG("%s(%p):{ lvl:%d, idx:%d } allocating new tree\n",
__func__, vm, lvl + 1, idx);

pt = fetch_and_zero(&alloc);
if (lvl) {
if (!pt) {
pt = &alloc_pd(vm)->pt;
if (IS_ERR(pt)) {
ret = PTR_ERR(pt);
goto out;
}
}
pt = stash->pt[!!lvl];
GEM_BUG_ON(!pt);

if (lvl ||
gen8_pt_count(*start, end) < I915_PDES ||
intel_vgpu_active(vm->i915))
fill_px(pt, vm->scratch[lvl].encode);
} else {
if (!pt) {
pt = alloc_pt(vm);
if (IS_ERR(pt)) {
ret = PTR_ERR(pt);
goto out;
}
}

if (intel_vgpu_active(vm->i915) ||
gen8_pt_count(*start, end) < I915_PDES)
fill_px(pt, vm->scratch[lvl].encode);
}

spin_lock(&pd->lock);
if (likely(!pd->entry[idx]))
if (likely(!pd->entry[idx])) {
stash->pt[!!lvl] = pt->stash;
atomic_set(&pt->used, 0);
set_pd_entry(pd, idx, pt);
else
alloc = pt, pt = pd->entry[idx];
} else {
pt = pd->entry[idx];
}
}

if (lvl) {
atomic_inc(&pt->used);
spin_unlock(&pd->lock);

ret = __gen8_ppgtt_alloc(vm, as_pd(pt),
start, end, lvl);
if (unlikely(ret)) {
if (release_pd_entry(pd, idx, pt, scratch))
free_px(vm, pt);
goto out;
}
__gen8_ppgtt_alloc(vm, stash,
as_pd(pt), start, end, lvl);

spin_lock(&pd->lock);
atomic_dec(&pt->used);
@@ -359,34 +338,22 @@ static int __gen8_ppgtt_alloc(struct i915_address_space * const vm,
}
} while (idx++, --len);
spin_unlock(&pd->lock);
out:
if (alloc)
free_px(vm, alloc);
return ret;
}

static int gen8_ppgtt_alloc(struct i915_address_space *vm,
u64 start, u64 length)
static void gen8_ppgtt_alloc(struct i915_address_space *vm,
struct i915_vm_pt_stash *stash,
u64 start, u64 length)
{
u64 from;
int err;

GEM_BUG_ON(!IS_ALIGNED(start, BIT_ULL(GEN8_PTE_SHIFT)));
GEM_BUG_ON(!IS_ALIGNED(length, BIT_ULL(GEN8_PTE_SHIFT)));
GEM_BUG_ON(range_overflows(start, length, vm->total));

start >>= GEN8_PTE_SHIFT;
length >>= GEN8_PTE_SHIFT;
GEM_BUG_ON(length == 0);
from = start;

err = __gen8_ppgtt_alloc(vm, i915_vm_to_ppgtt(vm)->pd,
&start, start + length, vm->top);
if (unlikely(err && from != start))
__gen8_ppgtt_clear(vm, i915_vm_to_ppgtt(vm)->pd,
from, start, vm->top);

return err;
__gen8_ppgtt_alloc(vm, stash, i915_vm_to_ppgtt(vm)->pd,
&start, start + length, vm->top);
}

static __always_inline void
@@ -703,6 +670,7 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt)

ppgtt_init(ppgtt, gt);
ppgtt->vm.top = i915_vm_is_4lvl(&ppgtt->vm) ? 3 : 2;
ppgtt->vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen8_pte_t));

/*
* From bdw, there is hw support for read-only pages in the PPGTT.
60 changes: 27 additions & 33 deletions drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -436,16 +436,17 @@ static void i915_ggtt_clear_range(struct i915_address_space *vm,
intel_gtt_clear_range(start >> PAGE_SHIFT, length >> PAGE_SHIFT);
}

static int ggtt_bind_vma(struct i915_address_space *vm,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 flags)
static void ggtt_bind_vma(struct i915_address_space *vm,
struct i915_vm_pt_stash *stash,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 flags)
{
struct drm_i915_gem_object *obj = vma->obj;
u32 pte_flags;

if (i915_vma_is_bound(vma, ~flags & I915_VMA_BIND_MASK))
return 0;
return;

/* Applicable to VLV (gen8+ do not support RO in the GGTT) */
pte_flags = 0;
@@ -454,8 +455,6 @@ static int ggtt_bind_vma(struct i915_address_space *vm,

vm->insert_entries(vm, vma, cache_level, pte_flags);
vma->page_sizes.gtt = I915_GTT_PAGE_SIZE;

return 0;
}

static void ggtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma)
@@ -568,31 +567,25 @@ static int init_ggtt(struct i915_ggtt *ggtt)
return ret;
}

static int aliasing_gtt_bind_vma(struct i915_address_space *vm,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 flags)
static void aliasing_gtt_bind_vma(struct i915_address_space *vm,
struct i915_vm_pt_stash *stash,
struct i915_vma *vma,
enum i915_cache_level cache_level,
u32 flags)
{
u32 pte_flags;
int ret;

/* Currently applicable only to VLV */
pte_flags = 0;
if (i915_gem_object_is_readonly(vma->obj))
pte_flags |= PTE_READ_ONLY;

if (flags & I915_VMA_LOCAL_BIND) {
struct i915_ppgtt *alias = i915_vm_to_ggtt(vm)->alias;

ret = ppgtt_bind_vma(&alias->vm, vma, cache_level, flags);
if (ret)
return ret;
}
if (flags & I915_VMA_LOCAL_BIND)
ppgtt_bind_vma(&i915_vm_to_ggtt(vm)->alias->vm,
stash, vma, cache_level, flags);

if (flags & I915_VMA_GLOBAL_BIND)
vm->insert_entries(vm, vma, cache_level, pte_flags);

return 0;
}

static void aliasing_gtt_unbind_vma(struct i915_address_space *vm,
Expand All @@ -607,6 +600,7 @@ static void aliasing_gtt_unbind_vma(struct i915_address_space *vm,

static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
{
struct i915_vm_pt_stash stash = {};
struct i915_ppgtt *ppgtt;
int err;

@@ -619,15 +613,17 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
goto err_ppgtt;
}

err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, ggtt->vm.total);
if (err)
goto err_ppgtt;

/*
* Note we only pre-allocate as far as the end of the global
* GTT. On 48b / 4-level page-tables, the difference is very,
* very significant! We have to preallocate as GVT/vgpu does
* not like the page directory disappearing.
*/
err = ppgtt->vm.allocate_va_range(&ppgtt->vm, 0, ggtt->vm.total);
if (err)
goto err_ppgtt;
ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, ggtt->vm.total);

ggtt->alias = ppgtt;
ggtt->vm.bind_async_flags |= ppgtt->vm.bind_async_flags;
@@ -638,6 +634,7 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
GEM_BUG_ON(ggtt->vm.vma_ops.unbind_vma != ggtt_unbind_vma);
ggtt->vm.vma_ops.unbind_vma = aliasing_gtt_unbind_vma;

i915_vm_free_pt_stash(&ppgtt->vm, &stash);
return 0;

err_ppgtt:
@@ -1165,11 +1162,6 @@ void i915_ggtt_disable_guc(struct i915_ggtt *ggtt)
ggtt->invalidate(ggtt);
}

static unsigned int clear_bind(struct i915_vma *vma)
{
return atomic_fetch_and(~I915_VMA_BIND_MASK, &vma->flags);
}

void i915_ggtt_resume(struct i915_ggtt *ggtt)
{
struct i915_vma *vma;
@@ -1187,11 +1179,13 @@ void i915_ggtt_resume(struct i915_ggtt *ggtt)
/* clflush objects bound into the GGTT and rebind them. */
list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) {
struct drm_i915_gem_object *obj = vma->obj;
unsigned int was_bound = clear_bind(vma);
unsigned int was_bound =
atomic_read(&vma->flags) & I915_VMA_BIND_MASK;

WARN_ON(i915_vma_bind(vma,
obj ? obj->cache_level : 0,
was_bound, NULL));
GEM_BUG_ON(!was_bound);
vma->ops->bind_vma(&ggtt->vm, NULL, vma,
obj ? obj->cache_level : 0,
was_bound);
if (obj) { /* only used during resume => exclusive access */
flush |= fetch_and_zero(&obj->write_domain);
obj->read_domains |= I915_GEM_DOMAIN_GTT;
[Remaining changed files not shown]
