Skip to content

Commit

Permalink
drm/amdkfd: Fix circular lock in nocpsch path
Browse files Browse the repository at this point in the history
Calling free_mqd inside destroy_queue_nocpsch_locked can cause a
circular lock dependency. destroy_queue_nocpsch_locked is called under
the DQM lock, which is also taken in MMU notifiers, potentially in FS
reclaim context. Taking another lock, namely the BO reservation lock
acquired by free_mqd, while potentially causing an FS reclaim inside the
DQM lock, creates a problematic circular lock dependency. Therefore,
move free_mqd out of destroy_queue_nocpsch_locked and call it after
unlocking the DQM lock.

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Amber Lin authored and Alex Deucher committed Jun 15, 2021
1 parent d760895 commit a7b2451
Showing 1 changed file with 13 additions and 5 deletions.
18 changes: 13 additions & 5 deletions drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
if (retval == -ETIME)
qpd->reset_wavefronts = true;


mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);

list_del(&q->list);
if (list_empty(&qpd->queues_list)) {
if (qpd->reset_wavefronts) {
Expand Down Expand Up @@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
int retval;
uint64_t sdma_val = 0;
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
struct mqd_manager *mqd_mgr =
dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];

/* Get the SDMA queue stats */
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
Expand All @@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
pdd->sdma_past_activity_counter += sdma_val;
dqm_unlock(dqm);

mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);

return retval;
}

Expand Down Expand Up @@ -1629,20 +1630,27 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
static int process_termination_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q, *next;
struct queue *q;
struct device_process_node *cur, *next_dpn;
int retval = 0;
bool found = false;

dqm_lock(dqm);

/* Clear all user mode queues */
list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
while (!list_empty(&qpd->queues_list)) {
struct mqd_manager *mqd_mgr;
int ret;

q = list_first_entry(&qpd->queues_list, struct queue, list);
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
if (ret)
retval = ret;
dqm_unlock(dqm);
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
dqm_lock(dqm);
}

/* Unregister process */
Expand Down

0 comments on commit a7b2451

Please sign in to comment.