Skip to content

Commit

Permalink
drm/amdkfd: Implement KFD process eviction/restore
Browse files Browse the repository at this point in the history
When the TTM memory manager in KGD evicts BOs, all user mode queues
potentially accessing these BOs must be evicted temporarily. Once
user mode queues are evicted, the eviction fence is signaled,
allowing the migration of the BO to proceed.

A delayed worker is scheduled to restore all the BOs belonging to
the evicted process and restart its queues.

During suspend/resume of the GPU we also evict all processes to allow
KGD to save BOs in system memory, since VRAM will be lost.

v2:
* Account for eviction when updating of q->is_active in MQD manager

Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
  • Loading branch information
Felix Kuehling authored and Oded Gabbay committed Feb 7, 2018
1 parent 403575c commit 2610343
Show file tree
Hide file tree
Showing 8 changed files with 547 additions and 8 deletions.
65 changes: 64 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "kfd_iommu.h"

#define MQD_SIZE_ALIGNED 768
static atomic_t kfd_device_suspended = ATOMIC_INIT(0);

#ifdef KFD_SUPPORT_IOMMU_V2
static const struct kfd_device_info kaveri_device_info = {
Expand Down Expand Up @@ -469,18 +470,32 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
if (!kfd->init_complete)
return;

/* For first KFD device suspend all the KFD processes */
if (atomic_inc_return(&kfd_device_suspended) == 1)
kfd_suspend_all_processes();

kfd->dqm->ops.stop(kfd->dqm);

kfd_iommu_suspend(kfd);
}

int kgd2kfd_resume(struct kfd_dev *kfd)
{
int ret, count;

if (!kfd->init_complete)
return 0;

return kfd_resume(kfd);
ret = kfd_resume(kfd);
if (ret)
return ret;

count = atomic_dec_return(&kfd_device_suspended);
WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
if (count == 0)
ret = kfd_resume_all_processes();

return ret;
}

static int kfd_resume(struct kfd_dev *kfd)
Expand Down Expand Up @@ -526,6 +541,54 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
spin_unlock(&kfd->interrupt_lock);
}

/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will
* prepare for safe eviction of KFD BOs that belong to the specified
* process.
*
* @mm: mm_struct that identifies the specified KFD process
* @fence: eviction fence attached to KFD process BOs
*
*/
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence)
{
struct kfd_process *p;
unsigned long active_time;
unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS);

if (!fence)
return -EINVAL;

if (dma_fence_is_signaled(fence))
return 0;

p = kfd_lookup_process_by_mm(mm);
if (!p)
return -ENODEV;

if (fence->seqno == p->last_eviction_seqno)
goto out;

p->last_eviction_seqno = fence->seqno;

/* Avoid KFD process starvation. Wait for at least
* PROCESS_ACTIVE_TIME_MS before evicting the process again
*/
active_time = get_jiffies_64() - p->last_restore_timestamp;
if (delay_jiffies > active_time)
delay_jiffies -= active_time;
else
delay_jiffies = 0;

/* During process initialization eviction_work.dwork is initialized
* to kfd_evict_bo_worker
*/
schedule_delayed_work(&p->eviction_work, delay_jiffies);
out:
kfd_unref_process(p);
return 0;
}

static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
unsigned int chunk_size)
{
Expand Down
219 changes: 218 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@
*
*/

#include <linux/ratelimit.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/printk.h>
#include <linux/bitops.h>
#include <linux/sched.h>
#include "kfd_priv.h"
Expand Down Expand Up @@ -180,6 +181,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
goto out_unlock;
}
q->properties.vmid = qpd->vmid;
/*
* Eviction state logic: we only mark active queues as evicted
* to avoid the overhead of restoring inactive queues later
*/
if (qpd->evicted)
q->properties.is_evicted = (q->properties.queue_size > 0 &&
q->properties.queue_percent > 0 &&
q->properties.queue_address != 0);

q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
Expand Down Expand Up @@ -377,15 +386,29 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
{
int retval;
struct mqd_manager *mqd;
struct kfd_process_device *pdd;
bool prev_active = false;

mutex_lock(&dqm->lock);
pdd = kfd_get_process_device_data(q->device, q->process);
if (!pdd) {
retval = -ENODEV;
goto out_unlock;
}
mqd = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
if (!mqd) {
retval = -ENOMEM;
goto out_unlock;
}
/*
* Eviction state logic: we only mark active queues as evicted
* to avoid the overhead of restoring inactive queues later
*/
if (pdd->qpd.evicted)
q->properties.is_evicted = (q->properties.queue_size > 0 &&
q->properties.queue_percent > 0 &&
q->properties.queue_address != 0);

/* Save previous activity state for counters */
prev_active = q->properties.is_active;
Expand Down Expand Up @@ -457,6 +480,187 @@ static struct mqd_manager *get_mqd_manager(
return mqd;
}

static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct mqd_manager *mqd;
struct kfd_process_device *pdd;
int retval = 0;

mutex_lock(&dqm->lock);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
goto out;

pdd = qpd_to_pdd(qpd);
pr_info_ratelimited("Evicting PASID %u queues\n",
pdd->process->pasid);

/* unactivate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_active)
continue;
mqd = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
if (!mqd) { /* should not be here */
pr_err("Cannot evict queue, mqd mgr is NULL\n");
retval = -ENOMEM;
goto out;
}
q->properties.is_evicted = true;
q->properties.is_active = false;
retval = mqd->destroy_mqd(mqd, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
if (retval)
goto out;
dqm->queue_count--;
}

out:
mutex_unlock(&dqm->lock);
return retval;
}

static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct kfd_process_device *pdd;
int retval = 0;

mutex_lock(&dqm->lock);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
goto out;

pdd = qpd_to_pdd(qpd);
pr_info_ratelimited("Evicting PASID %u queues\n",
pdd->process->pasid);

/* unactivate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_active)
continue;
q->properties.is_evicted = true;
q->properties.is_active = false;
dqm->queue_count--;
}
retval = execute_queues_cpsch(dqm,
qpd->is_debug ?
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);

out:
mutex_unlock(&dqm->lock);
return retval;
}

static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct mqd_manager *mqd;
struct kfd_process_device *pdd;
uint32_t pd_base;
int retval = 0;

pdd = qpd_to_pdd(qpd);
/* Retrieve PD base */
pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);

mutex_lock(&dqm->lock);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
goto out;
if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
qpd->evicted--;
goto out;
}

pr_info_ratelimited("Restoring PASID %u queues\n",
pdd->process->pasid);

/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%08x\n", pd_base);

if (!list_empty(&qpd->queues_list)) {
dqm->dev->kfd2kgd->set_vm_context_page_table_base(
dqm->dev->kgd,
qpd->vmid,
qpd->page_table_base);
kfd_flush_tlb(pdd);
}

/* activate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_evicted)
continue;
mqd = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
if (!mqd) { /* should not be here */
pr_err("Cannot restore queue, mqd mgr is NULL\n");
retval = -ENOMEM;
goto out;
}
q->properties.is_evicted = false;
q->properties.is_active = true;
retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
q->queue, &q->properties,
q->process->mm);
if (retval)
goto out;
dqm->queue_count++;
}
qpd->evicted = 0;
out:
mutex_unlock(&dqm->lock);
return retval;
}

static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct kfd_process_device *pdd;
uint32_t pd_base;
int retval = 0;

pdd = qpd_to_pdd(qpd);
/* Retrieve PD base */
pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);

mutex_lock(&dqm->lock);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
goto out;
if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
qpd->evicted--;
goto out;
}

pr_info_ratelimited("Restoring PASID %u queues\n",
pdd->process->pasid);

/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%08x\n", pd_base);

/* activate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_evicted)
continue;
q->properties.is_evicted = false;
q->properties.is_active = true;
dqm->queue_count++;
}
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (!retval)
qpd->evicted = 0;
out:
mutex_unlock(&dqm->lock);
return retval;
}

static int register_process(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
Expand Down Expand Up @@ -853,6 +1057,14 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
retval = -ENOMEM;
goto out;
}
/*
* Eviction state logic: we only mark active queues as evicted
* to avoid the overhead of restoring inactive queues later
*/
if (qpd->evicted)
q->properties.is_evicted = (q->properties.queue_size > 0 &&
q->properties.queue_percent > 0 &&
q->properties.queue_address != 0);

dqm->asic_ops.init_sdma_vm(dqm, q, qpd);

Expand Down Expand Up @@ -1291,6 +1503,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
dqm->ops.set_trap_handler = set_trap_handler;
dqm->ops.process_termination = process_termination_cpsch;
dqm->ops.evict_process_queues = evict_process_queues_cpsch;
dqm->ops.restore_process_queues = restore_process_queues_cpsch;
break;
case KFD_SCHED_POLICY_NO_HWS:
/* initialize dqm for no cp scheduling */
Expand All @@ -1307,6 +1521,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
dqm->ops.set_trap_handler = set_trap_handler;
dqm->ops.process_termination = process_termination_nocpsch;
dqm->ops.evict_process_queues = evict_process_queues_nocpsch;
dqm->ops.restore_process_queues =
restore_process_queues_nocpsch;
break;
default:
pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
Expand Down
9 changes: 9 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ struct device_process_node {
*
* @process_termination: Clears all process queues belongs to that device.
*
* @evict_process_queues: Evict all active queues of a process
*
* @restore_process_queues: Restore all evicted queues queues of a process
*
*/

struct device_queue_manager_ops {
Expand Down Expand Up @@ -129,6 +133,11 @@ struct device_queue_manager_ops {

int (*process_termination)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);

int (*evict_process_queues)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
int (*restore_process_queues)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
};

struct device_queue_manager_asic_ops {
Expand Down
Loading

0 comments on commit 2610343

Please sign in to comment.