drm/amdkfd: unregistered svm range not overlap with TTM range
When creating an unregistered new svm range to recover a retry fault,
avoid the new svm range overlapping with ranges or userptr ranges
managed by TTM; otherwise svm migration will trigger TTM or userptr
eviction and evict user queues unexpectedly.
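
A minimal userspace sketch (not kernel code, and not part of this
patch) of the collision being avoided, assuming page-granular closed
intervals [start, last] as used by the svm code: two ranges overlap
iff neither ends before the other begins. The name ranges_overlap and
the values are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Closed intervals [a_start, a_last] and [b_start, b_last], in pages. */
static bool ranges_overlap(unsigned long a_start, unsigned long a_last,
			   unsigned long b_start, unsigned long b_last)
{
	return a_start <= b_last && b_start <= a_last;
}

int main(void)
{
	/* A 2MB svm range (pages 0x200-0x3ff) vs. a TTM-managed range. */
	printf("%d\n", ranges_overlap(0x200, 0x3ff, 0x300, 0x4ff)); /* 1 */
	printf("%d\n", ranges_overlap(0x200, 0x2ff, 0x300, 0x4ff)); /* 0 */
	return 0;
}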

Change the helper amdgpu_ttm_tt_affect_userptr to return the userptr
that falls inside the range. Add the helper svm_range_check_vm_userptr
to scan all userptrs of the vm and return the overlapping userptr bo's
start and last addresses.
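
A sketch of the changed helper contract, using a mocked-up stand-in
for struct amdgpu_ttm_tt (the names mock_tt and affect_userptr are
illustrative only): the helper compares byte addresses, and the new
out-parameter lets the caller recover the overlapping bo's page range
the way svm_range_check_vm_userptr does below.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct mock_tt {
	unsigned long userptr;   /* CPU VA of the userptr mapping, bytes */
	unsigned long num_pages; /* size of the mapping, in pages */
};

/* Mirrors the overlap test in amdgpu_ttm_tt_affect_userptr. */
static bool affect_userptr(struct mock_tt *tt, unsigned long start,
			   unsigned long end, unsigned long *userptr)
{
	unsigned long size = tt->num_pages << PAGE_SHIFT;

	if (tt->userptr > end || tt->userptr + size <= start)
		return false;

	if (userptr)
		*userptr = tt->userptr; /* report where the mapping starts */
	return true;
}

int main(void)
{
	struct mock_tt tt = { .userptr = 0x300000UL, .num_pages = 16 };
	unsigned long userptr, bo_s, bo_l;

	/* svm range [0x200, 0x3ff] in pages, converted to byte addresses */
	if (affect_userptr(&tt, 0x200UL << PAGE_SHIFT, 0x3ffUL << PAGE_SHIFT,
			   &userptr)) {
		bo_s = userptr >> PAGE_SHIFT;   /* 0x300 */
		bo_l = bo_s + tt.num_pages - 1; /* 0x30f */
		printf("overlaps userptr bo [0x%lx 0x%lx]\n", bo_s, bo_l);
	}
	return 0;
}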

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Philip Yang authored and Philip Yang committed Oct 14, 2021
1 parent 9d2b459 commit df11094
Showing 4 changed files with 100 additions and 11 deletions.
6 changes: 3 additions & 3 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -272,7 +272,7 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
 
 	list_for_each_entry(bo, &node->bos, mn_list) {
 
-		if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end))
+		if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end, NULL))
 			continue;
 
 		r = dma_resv_wait_timeout_rcu(amdkcl_ttm_resvp(&bo->tbo),
@@ -361,7 +361,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 
 		if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
						 range->start,
-						 end))
+						 end, NULL))
 			amdgpu_amdkfd_evict_userptr(mem, range->mm);
 	}
 }
@@ -474,7 +474,7 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 		struct kgd_mem *mem = bo->kfd_bo;
 
 		if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
-						  start, end))
+						  start, end, NULL))
 			amdgpu_amdkfd_evict_userptr(mem, mm);
 	}
 }
8 changes: 6 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1456,7 +1456,7 @@ struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm)
  *
  */
 bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
-				  unsigned long end)
+				  unsigned long end, unsigned long *userptr)
 {
 	struct amdgpu_ttm_tt *gtt = (void *)ttm;
 	unsigned long size;
@@ -1471,6 +1471,8 @@ bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
 	if (gtt->userptr > end || gtt->userptr + size <= start)
 		return false;
 
+	if (userptr)
+		*userptr = gtt->userptr;
 	return true;
 }
 
@@ -1494,7 +1496,7 @@ bool amdgpu_ttm_tt_is_userptr(struct ttm_tt *ttm)
  *
  */
 bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
-				  unsigned long end)
+				  unsigned long end, unsigned long *userptr)
 {
 	struct amdgpu_ttm_tt *gtt = (void *)ttm;
 	struct amdgpu_ttm_gup_task_list *entry;
@@ -1524,6 +1526,8 @@ bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
 
 	atomic_inc(&gtt->mmu_invalidations);
 
+	if (userptr)
+		*userptr = gtt->userptr;
 	return true;
 }
 
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -193,7 +193,7 @@ int amdgpu_ttm_tt_set_userptr(struct ttm_buffer_object *bo,
 bool amdgpu_ttm_tt_has_userptr(struct ttm_tt *ttm);
 struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm);
 bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
-				  unsigned long end);
+				  unsigned long end, unsigned long *userptr);
 #ifdef HAVE_AMDKCL_HMM_MIRROR_ENABLED
 bool amdgpu_ttm_tt_is_userptr(struct ttm_tt *ttm);
 #else
95 changes: 90 additions & 5 deletions drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -50,7 +50,9 @@ static bool
 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq);
-
+static int
+svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
+		   uint64_t *bo_s, uint64_t *bo_l);
 static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
 	.invalidate = svm_range_cpu_invalidate_pagetables,
 };
@@ -2304,6 +2306,7 @@ svm_range_best_restore_location(struct svm_range *prange,
 
 	return -1;
 }
+
 static int
 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
			       unsigned long *start, unsigned long *last)
@@ -2351,8 +2354,59 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
		 vma->vm_end >> PAGE_SHIFT, *last);
 
 	return 0;
 }
+
+static int
+svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
+			   uint64_t *bo_s, uint64_t *bo_l)
+{
+	struct amdgpu_bo_va_mapping *mapping;
+	struct interval_tree_node *node;
+	struct amdgpu_bo *bo = NULL;
+	unsigned long userptr;
+	uint32_t i;
+	int r;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct amdgpu_vm *vm;
+
+		if (!p->pdds[i]->drm_priv)
+			continue;
+
+		vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
+		r = amdgpu_bo_reserve(vm->root.bo, false);
+		if (r)
+			return r;
+
+		/* Check userptr by searching entire vm->va interval tree */
+		node = interval_tree_iter_first(&vm->va, 0, ~0ULL);
+		while (node) {
+			mapping = container_of((struct rb_node *)node,
+					       struct amdgpu_bo_va_mapping, rb);
+			bo = mapping->bo_va->base.bo;
+
+			if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
+							  start << PAGE_SHIFT,
+							  last << PAGE_SHIFT,
+							  &userptr)) {
+				node = interval_tree_iter_next(node, 0, ~0ULL);
+				continue;
+			}
+
+			pr_debug("[0x%llx 0x%llx] already userptr mapped\n",
+				 start, last);
+			if (bo_s && bo_l) {
+				*bo_s = userptr >> PAGE_SHIFT;
+				*bo_l = *bo_s + bo->tbo.ttm->num_pages - 1;
+			}
+			amdgpu_bo_unreserve(vm->root.bo);
+			return -EADDRINUSE;
+		}
+		amdgpu_bo_unreserve(vm->root.bo);
+	}
+	return 0;
+}
 
 static struct
 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
				struct kfd_process *p,
@@ -2362,10 +2416,26 @@ svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
 	struct svm_range *prange = NULL;
 	unsigned long start, last;
 	uint32_t gpuid, gpuidx;
+	uint64_t bo_s = 0;
+	uint64_t bo_l = 0;
+	int r;
 
 	if (svm_range_get_range_boundaries(p, addr, &start, &last))
 		return NULL;
+
+	r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
+	if (r != -EADDRINUSE)
+		r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l);
+
+	if (r == -EADDRINUSE) {
+		if (addr >= bo_s && addr <= bo_l)
+			return NULL;
+
+		/* Create one page svm range if 2MB range overlapping */
+		start = addr;
+		last = addr;
+	}
 
 	prange = svm_range_new(&p->svms, start, last);
 	if (!prange) {
 		pr_debug("Failed to create prange in address [0x%llx]\n", addr);
@@ -2668,6 +2738,8 @@ int svm_range_list_init(struct kfd_process *p)
  * @p: current kfd_process
  * @start: range start address, in pages
  * @last: range last address, in pages
+ * @bo_s: mapping start address in pages if address range already mapped
+ * @bo_l: mapping last address in pages if address range already mapped
  *
  * The purpose is to avoid virtual address ranges already allocated by
  * kfd_ioctl_alloc_memory_of_gpu ioctl.
@@ -2682,8 +2754,11 @@ int svm_range_list_init(struct kfd_process *p)
  * a signal. Release all buffer reservations and return to user-space.
  */
 static int
-svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last)
+svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
+		   uint64_t *bo_s, uint64_t *bo_l)
 {
+	struct amdgpu_bo_va_mapping *mapping;
+	struct interval_tree_node *node;
 	uint32_t i;
 	int r;
 
@@ -2697,8 +2772,17 @@ svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last)
		r = amdgpu_bo_reserve(vm->root.bo, false);
		if (r)
			return r;
-		if (interval_tree_iter_first(&vm->va, start, last)) {
-			pr_debug("Range [0x%llx 0x%llx] already mapped\n", start, last);
+
+		node = interval_tree_iter_first(&vm->va, start, last);
+		if (node) {
+			pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
+				 start, last);
+			mapping = container_of((struct rb_node *)node,
+					       struct amdgpu_bo_va_mapping, rb);
+			if (bo_s && bo_l) {
+				*bo_s = mapping->start;
+				*bo_l = mapping->last;
+			}
			amdgpu_bo_unreserve(vm->root.bo);
			return -EADDRINUSE;
		}
@@ -2739,7 +2823,8 @@ svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
		start = min(end, vma->vm_end);
	} while (start < end);
 
-	return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT);
+	return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
+				  NULL);
 }
 
 /**
