From 2a417b772dc0e36046f4f116391b15a598e74dcd Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 22 Oct 2021 12:24:36 -0400 Subject: [PATCH] drm/amdkfd: restore userptr ignore bad address error The userptr can be unmapped by application and still registered to driver, restore userptr work return user pages will get -EFAULT bad address error. Pretend this error as succeed. GPU access this userptr will have VM fault later, it is better than application soft hangs with stalled user mode queues. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 27 ++++++++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +++ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ee5d3a81d21b..61784bbfd7fd9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2456,18 +2456,25 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info, /* Get updated user pages */ ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages); if (ret) { - pr_debug("%s: Failed to get user pages: %d\n", - __func__, ret); + pr_debug("Failed %d to get user pages\n", ret); + + /* Return -EFAULT bad address error as success. It will + * fail later with a VM fault if the GPU tries to access + * it. Better than hanging indefinitely with stalled + * user mode queues. + * + * Return other error -EBUSY or -ENOMEM to retry restore + */ + if (ret != -EFAULT) + return ret; + } else { - /* Return error -EBUSY or -ENOMEM, retry restore */ - return ret; + /* + * FIXME: Cannot ignore the return code, must hold + * notifier_lock + */ + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); } - - /* - * FIXME: Cannot ignore the return code, must hold - * notifier_lock - */ - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); #else if (!mem->user_pages) { mem->user_pages = diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index eb732cc94016d..9174fc346de67 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -768,6 +768,9 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start, ttm->num_pages, >t->range, readonly, false, NULL); + if (r) + pr_debug("failed %d to get user pages 0x%llx\n", r, start); + out_putmm: mmput(mm);