Skip to content

Commit

Permalink
drm/amdkfd: fix zero reading of VMID and PASID for Hawaii
Browse files Browse the repository at this point in the history
Upon a VM fault, the VMID and PASID written by HW are zero on
Hawaii. Instead of reading them from ih_ring_entry, read them directly
from the registers. This workaround fixes the soft hang issues
caused by mishandled VM faults on Hawaii.

Signed-off-by: Lan Xiao <Lan.Xiao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
  • Loading branch information
Lan Xiao authored and Oded Gabbay committed Jul 12, 2018
1 parent 2640c3f commit 58e6988
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 10 deletions.
20 changes: 19 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
uint32_t page_table_base);
static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid);
static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd);

/* Because of REG_GET_FIELD() being used, we put this function in the
* asic specific file.
Expand Down Expand Up @@ -216,7 +217,8 @@ static const struct kfd2kgd_calls kfd2kgd = {
.invalidate_tlbs = invalidate_tlbs,
.invalidate_tlbs_vmid = invalidate_tlbs_vmid,
.submit_ib = amdgpu_amdkfd_submit_ib,
.get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info
.get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
.read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg
};

struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
Expand Down Expand Up @@ -912,3 +914,19 @@ static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid)
RREG32(mmVM_INVALIDATE_RESPONSE);
return 0;
}

/**
* read_vmid_from_vmfault_reg - read the VMID from the VM fault status register
*
* @kgd: kgd device handle; converted to the amdgpu_device with
*       get_amdgpu_device()
*
* Reads mmVM_CONTEXT1_PROTECTION_FAULT_STATUS once and extracts its VMID
* field (CIK).
*
* Return: the value of the VMID field of the fault status register.
*/
static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd)
{
/* NOTE(review): adev is not referenced explicitly below; presumably the
* RREG32() macro uses it implicitly - confirm before renaming or removing.
*/
struct amdgpu_device *adev = get_amdgpu_device(kgd);

/* Raw contents of the context-1 protection fault status register. */
uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS);

/* Isolate the VMID field from the raw register value. */
return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID);
}
29 changes: 28 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,39 @@
#include "cik_int.h"

static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre,
bool *patched_flag)
{
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
unsigned int vmid, pasid;

/* This workaround is due to HW/FW limitation on Hawaii that
* VMID and PASID are not written into ih_ring_entry
*/
if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
dev->device_info->asic_family == CHIP_HAWAII) {
struct cik_ih_ring_entry *tmp_ihre =
(struct cik_ih_ring_entry *)patched_ihre;

*patched_flag = true;
*tmp_ihre = *ihre;

vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd);
pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);

tmp_ihre->ring_id &= 0x000000ff;
tmp_ihre->ring_id |= vmid << 8;
tmp_ihre->ring_id |= pasid << 16;

return (pasid != 0) &&
vmid >= dev->vm_info.first_vmid_kfd &&
vmid <= dev->vm_info.last_vmid_kfd;
}

/* Only handle interrupts from KFD VMIDs */
vmid = (ihre->ring_id & 0x0000ff00) >> 8;
if (vmid < dev->vm_info.first_vmid_kfd ||
Expand Down
14 changes: 12 additions & 2 deletions drivers/gpu/drm/amd/amdkfd/kfd_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -577,14 +577,24 @@ static int kfd_resume(struct kfd_dev *kfd)
/* This is called directly from KGD at ISR. */
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{
uint32_t patched_ihre[KFD_MAX_RING_ENTRY_SIZE];
bool is_patched = false;

if (!kfd->init_complete)
return;

if (kfd->device_info->ih_ring_entry_size > sizeof(patched_ihre)) {
dev_err_once(kfd_device, "Ring entry too small\n");
return;
}

spin_lock(&kfd->interrupt_lock);

if (kfd->interrupts_active
&& interrupt_is_wanted(kfd, ih_ring_entry)
&& enqueue_ih_ring_entry(kfd, ih_ring_entry))
&& interrupt_is_wanted(kfd, ih_ring_entry,
patched_ihre, &is_patched)
&& enqueue_ih_ring_entry(kfd,
is_patched ? patched_ihre : ih_ring_entry))
queue_work(kfd->ih_wq, &kfd->interrupt_work);

spin_unlock(&kfd->interrupt_lock);
Expand Down
4 changes: 3 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@


static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre,
bool *patched_flag)
{
uint16_t source_id, client_id, pasid, vmid;
const uint32_t *data = ih_ring_entry;
Expand Down
6 changes: 4 additions & 2 deletions drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,15 @@ static void interrupt_wq(struct work_struct *work)
ih_ring_entry);
}

bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry)
bool interrupt_is_wanted(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre, bool *flag)
{
/* integer and bitwise OR so there is no boolean short-circuiting */
unsigned int wanted = 0;

wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev,
ih_ring_entry);
ih_ring_entry, patched_ihre, flag);

return wanted != 0;
}
9 changes: 6 additions & 3 deletions drivers/gpu/drm/amd/amdkfd/kfd_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,10 @@ enum cache_policy {

struct kfd_event_interrupt_class {
bool (*interrupt_isr)(struct kfd_dev *dev,
const uint32_t *ih_ring_entry);
const uint32_t *ih_ring_entry, uint32_t *patched_ihre,
bool *patched_flag);
void (*interrupt_wq)(struct kfd_dev *dev,
const uint32_t *ih_ring_entry);
const uint32_t *ih_ring_entry);
};

struct kfd_device_info {
Expand Down Expand Up @@ -806,7 +807,9 @@ int kfd_interrupt_init(struct kfd_dev *dev);
void kfd_interrupt_exit(struct kfd_dev *dev);
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry);
bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry);
bool interrupt_is_wanted(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre, bool *flag);

/* Power Management */
void kgd2kfd_suspend(struct kfd_dev *kfd);
Expand Down
5 changes: 5 additions & 0 deletions drivers/gpu/drm/amd/include/kgd_kfd_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,10 @@ struct tile_config {
* faults. On GFXv9 VM fault information is fully contained in the IH
* packet and this function is not needed.
*
* @read_vmid_from_vmfault_reg: On Hawaii the VMID is not set in the
* IH ring entry. This function allows the KFD ISR to get the VMID
* from the fault status register as early as possible.
*
* This structure contains function pointers to services that the kgd driver
* provides to amdkfd driver.
*
Expand Down Expand Up @@ -394,6 +398,7 @@ struct kfd2kgd_calls {

int (*get_vm_fault_info)(struct kgd_dev *kgd,
struct kfd_vm_fault_info *info);
uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd);
};

/**
Expand Down

0 comments on commit 58e6988

Please sign in to comment.