Skip to content

Commit

Permalink
drm/amdkfd: Handle VM faults in KFD
Browse files Browse the repository at this point in the history
1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address per
   per-vmid. amdkfd needs to get the information from amdgpu through the
   new get_vm_fault_info interface. On GFX9 and later, all the required
   information is in the IH ring
2. amdkfd unmaps all queues from the faulting process and create new
   run-list without the guilty process
3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY

Signed-off-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
  • Loading branch information
shaoyunl authored and Oded Gabbay committed Jul 12, 2018
1 parent b97dfa2 commit 2640c3f
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 7 deletions.
25 changes: 21 additions & 4 deletions drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
}

static void cik_event_interrupt_wq(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
{
unsigned int pasid;
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
uint32_t context_id = ihre->data & 0xfffffff;

pasid = (ihre->ring_id & 0xffff0000) >> 16;
unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;

if (pasid == 0)
return;
Expand All @@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
kfd_signal_hw_exception_event(pasid);
else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;

kfd_process_vm_fault(dev->dqm, pasid);

memset(&info, 0, sizeof(info));
dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
if (!info.page_addr && !info.status)
return;

if (info.vmid == vmid)
kfd_signal_vm_fault_event(dev, pasid, &info);
else
kfd_signal_vm_fault_event(dev, pasid, NULL);
}
}

const struct kfd_event_interrupt_class event_interrupt_class_cik = {
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/cik_int.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
#define CIK_INTSRC_SDMA_TRAP 0xE0
#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93

#endif

17 changes: 17 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
kfree(dqm);
}

int kfd_process_vm_fault(struct device_queue_manager *dqm,
unsigned int pasid)
{
struct kfd_process_device *pdd;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
int ret = 0;

if (!p)
return -EINVAL;
pdd = kfd_get_process_device_data(dqm->dev, p);
if (pdd)
ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
kfd_unref_process(p);

return ret;
}

#if defined(CONFIG_DEBUG_FS)

static void seq_reg_dump(struct seq_file *m,
Expand Down
37 changes: 37 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_events.c
Original file line number Diff line number Diff line change
Expand Up @@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
mutex_unlock(&p->event_mutex);
kfd_unref_process(p);
}

void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
struct kfd_vm_fault_info *info)
{
struct kfd_event *ev;
uint32_t id;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
struct kfd_hsa_memory_exception_data memory_exception_data;

if (!p)
return; /* Presumably process exited. */
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.gpu_id = dev->id;
memory_exception_data.failure.imprecise = 1;
/* Set failure reason */
if (info) {
memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
memory_exception_data.failure.NotPresent =
info->prot_valid ? 1 : 0;
memory_exception_data.failure.NoExecute =
info->prot_exec ? 1 : 0;
memory_exception_data.failure.ReadOnly =
info->prot_write ? 1 : 0;
memory_exception_data.failure.imprecise = 0;
}
mutex_lock(&p->event_mutex);

id = KFD_FIRST_NONSIGNAL_EVENT_ID;
idr_for_each_entry_continue(&p->event_idr, ev, id)
if (ev->type == KFD_EVENT_TYPE_MEMORY) {
ev->memory_exception_data = memory_exception_data;
set_event(ev);
}

mutex_unlock(&p->event_mutex);
kfd_unref_process(p);
}
18 changes: 16 additions & 2 deletions drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SDMA_TRAP ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
source_id == SOC15_INTSRC_CP_BAD_OPCODE;
source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_UTCL2;
}

static void event_interrupt_wq_v9(struct kfd_dev *dev,
Expand All @@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
kfd_signal_hw_exception_event(pasid);
else if (client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_UTCL2) {
/* TODO */
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);

info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
info.prot_valid = ring_id & 0x08;
info.prot_read = ring_id & 0x10;
info.prot_write = ring_id & 0x20;

kfd_process_vm_fault(dev->dqm, pasid);
kfd_signal_vm_fault_event(dev, pasid, &info);
}
}

Expand Down
4 changes: 4 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq);
int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);

/* Process Queue Manager */
struct process_queue_node {
Expand Down Expand Up @@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint64_t *event_page_offset, uint32_t *event_slot_index);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);

void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
struct kfd_vm_fault_info *info);

void kfd_flush_tlb(struct kfd_process_device *pdd);

int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
Expand Down
2 changes: 1 addition & 1 deletion include/uapi/linux/kfd_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
__u32 NotPresent; /* Page not present or supervisor privilege */
__u32 ReadOnly; /* Write access to a read-only page */
__u32 NoExecute; /* Execute access to a page marked NX */
__u32 pad;
__u32 imprecise; /* Can't determine the exact fault address */
};

/* memory exception data*/
Expand Down

0 comments on commit 2640c3f

Please sign in to comment.