Skip to content

Commit

Permalink
drm/amdkfd: Free queue after unmap queue success
Browse files Browse the repository at this point in the history
After queue unmap or remove from MES successfully, free queue sysfs
entries, doorbell and remove from queue list. Otherwise, application may
destroy queue again, cause below kernel warning or crash backtrace.

For outstanding queues, either application forget to destroy or failed
to destroy, kfd_process_notifier_release will remove queue sysfs
entries, kfd_process_wq_release will free queue doorbell.

v2: decrement_queue_count for MES queue

 refcount_t: underflow; use-after-free.
 WARNING: CPU: 7 PID: 3053 at lib/refcount.c:28
  Call Trace:
   kobject_put+0xd6/0x1a0
   kfd_procfs_del_queue+0x27/0x30 [amdgpu]
   pqm_destroy_queue+0xeb/0x240 [amdgpu]
   kfd_ioctl_destroy_queue+0x32/0x70 [amdgpu]
   kfd_ioctl+0x27d/0x500 [amdgpu]
   do_syscall_64+0x35/0x80

 WARNING: CPU: 2 PID: 3053 at drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:400
  Call Trace:
   deallocate_doorbell.isra.0+0x39/0x40 [amdgpu]
   destroy_queue_cpsch+0xb3/0x270 [amdgpu]
   pqm_destroy_queue+0x108/0x240 [amdgpu]
   kfd_ioctl_destroy_queue+0x32/0x70 [amdgpu]
   kfd_ioctl+0x27d/0x500 [amdgpu]

 general protection fault, probably for non-canonical address
0xdead000000000108:
 Call Trace:
  pqm_destroy_queue+0xf0/0x200 [amdgpu]
  kfd_ioctl_destroy_queue+0x2f/0x60 [amdgpu]
  kfd_ioctl+0x19b/0x600 [amdgpu]

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Graham Sider <Graham.Sider@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Philip Yang authored and Alex Deucher committed Jun 22, 2022
1 parent f4f9b82 commit ab8529b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 12 deletions.
28 changes: 17 additions & 11 deletions drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1872,6 +1872,22 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,

}

if (q->properties.is_active) {
if (!dqm->dev->shared_resources.enable_mes) {
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
} else {
retval = remove_queue_mes(dqm, q, qpd);
}

if (retval)
goto failed_unmap_queue;

decrement_queue_count(dqm, qpd, q);
}

mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];

Expand All @@ -1885,17 +1901,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,

list_del(&q->list);
qpd->queue_count--;
if (q->properties.is_active) {
if (!dqm->dev->shared_resources.enable_mes) {
decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
} else {
retval = remove_queue_mes(dqm, q, qpd);
}
}

/*
* Unconditionally decrement this counter, regardless of the queue's
Expand All @@ -1912,6 +1917,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,

return retval;

failed_unmap_queue:
failed_try_destroy_debugged_queue:

dqm_unlock(dqm);
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
}

if (pqn->q) {
kfd_procfs_del_queue(pqn->q);
dqm = pqn->q->device->dqm;
retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
if (retval) {
Expand All @@ -439,6 +438,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
if (dev->shared_resources.enable_mes)
amdgpu_amdkfd_free_gtt_mem(dev->adev,
pqn->q->gang_ctx_bo);
kfd_procfs_del_queue(pqn->q);
uninit_queue(pqn->q);
}

Expand Down

0 comments on commit ab8529b

Please sign in to comment.