Skip to content

Commit

Permalink
drm/amdkfd: Have kfd driver use same PASID values from graphic driver
Browse files Browse the repository at this point in the history
The kfd driver currently allocates its own PASID value for each kfd process
and uses it to locate the vm in the interrupt handler and to map between a kfd
process and its vm. That design does not work when a physical gpu device has
multiple spatial partitions, e.g. adev in CPX mode. This patch makes the kfd
driver use the same pasid values that the graphics driver generates, which are
allocated per vm (one pasid per vm).

These pasid values are passed to fw/hardware. The interrupt handler does not
need to change even though more pasid values are in use. Also, pasid values in
log messages are replaced by the user process pid; pasid values are not exposed
to the user. Users see their process pids, which are meaningful in user space.

Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Xiaogang Chen authored and Alex Deucher committed Feb 13, 2025
1 parent ca44922 commit 8544374
Show file tree
Hide file tree
Showing 16 changed files with 196 additions and 181 deletions.
3 changes: 1 addition & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
};

struct amdgpu_device;
struct kfd_process_device;
struct amdgpu_reset_context;

enum kfd_mem_attachment_type {
Expand Down Expand Up @@ -299,8 +300,6 @@ bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id);
(&((struct amdgpu_fpriv *) \
((struct drm_file *)(drm_priv))->driver_priv)->vm)

int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
struct amdgpu_vm *avm, u32 pasid);
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
struct amdgpu_vm *avm,
void **process_info,
Expand Down
21 changes: 0 additions & 21 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1529,27 +1529,6 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
amdgpu_bo_unreserve(bo);
}

/**
 * amdgpu_amdkfd_gpuvm_set_vm_pasid - replace the pasid recorded for a vm
 * @adev: amdgpu device handed through to amdgpu_vm_set_pasid()
 * @avm: vm whose pasid is replaced
 * @pasid: pasid to set on @avm
 *
 * If @avm already carries a non-zero pasid, that pasid is freed and the vm's
 * pasid is reset to 0 before @pasid is set. The result of the reset call is
 * not checked.
 *
 * Return: the value returned by amdgpu_vm_set_pasid() for @pasid.
 */
int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
				     struct amdgpu_vm *avm, u32 pasid)
{
	/*
	 * Drop the pasid that amdgpu allocated originally; the kfd
	 * allocated pasid takes its place below.
	 */
	if (avm->pasid) {
		amdgpu_pasid_free(avm->pasid);
		amdgpu_vm_set_pasid(adev, avm, 0);
	}

	return amdgpu_vm_set_pasid(adev, avm, pasid);
}

int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
struct amdgpu_vm *avm,
void **process_info,
Expand Down
18 changes: 14 additions & 4 deletions drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,30 @@ static void cik_event_interrupt_wq(struct kfd_node *dev,
kfd_signal_hw_exception_event(pasid);
else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_process_device *pdd = NULL;
struct kfd_vm_fault_info info;
struct kfd_process *p;

kfd_smi_event_update_vmfault(dev, pasid);
kfd_dqm_evict_pasid(dev->dqm, pasid);
p = kfd_lookup_process_by_pasid(pasid, &pdd);
if (!pdd)
return;

kfd_evict_process_device(pdd);

memset(&info, 0, sizeof(info));
amdgpu_amdkfd_gpuvm_get_vm_fault_info(dev->adev, &info);
if (!info.page_addr && !info.status)
if (!info.page_addr && !info.status) {
kfd_unref_process(p);
return;
}

if (info.vmid == vmid)
kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
kfd_signal_vm_fault_event(pdd, &info, NULL);
else
kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
kfd_signal_vm_fault_event(pdd, &info, NULL);

kfd_unref_process(p);
}
}

Expand Down
25 changes: 13 additions & 12 deletions drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,8 @@ static int kfd_open(struct inode *inode, struct file *filep)
/* filep now owns the reference returned by kfd_create_process */
filep->private_data = process;

dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
process->pasid, process->is_32bit_user_mode);
dev_dbg(kfd_device, "process pid %d opened kfd node, compat mode (32 bit) - %d\n",
process->lead_thread->pid, process->is_32bit_user_mode);

return 0;
}
Expand Down Expand Up @@ -361,8 +361,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
goto err_acquire_queue_buf;
}

pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
p->pasid,
pr_debug("Creating queue for process pid %d on gpu 0x%x\n",
p->lead_thread->pid,
dev->id);

err = pqm_create_queue(&p->pqm, dev, &q_properties, &queue_id,
Expand Down Expand Up @@ -415,9 +415,9 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
int retval;
struct kfd_ioctl_destroy_queue_args *args = data;

pr_debug("Destroying queue id %d for pasid 0x%x\n",
pr_debug("Destroying queue id %d for process pid %d\n",
args->queue_id,
p->pasid);
p->lead_thread->pid);

mutex_lock(&p->mutex);

Expand Down Expand Up @@ -468,8 +468,8 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
properties.pm4_target_xcc = (args->queue_percentage >> 8) & 0xFF;
properties.priority = args->queue_priority;

pr_debug("Updating queue id %d for pasid 0x%x\n",
args->queue_id, p->pasid);
pr_debug("Updating queue id %d for process pid %d\n",
args->queue_id, p->lead_thread->pid);

mutex_lock(&p->mutex);

Expand Down Expand Up @@ -695,7 +695,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
struct kfd_process_device_apertures *pAperture;
int i;

dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
dev_dbg(kfd_device, "get apertures for process pid %d", p->lead_thread->pid);

args->num_of_nodes = 0;

Expand Down Expand Up @@ -747,7 +747,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
int ret;
int i;

dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
dev_dbg(kfd_device, "get apertures for process pid %d",
p->lead_thread->pid);

if (args->num_of_nodes == 0) {
/* Return number of nodes, so that user space can allocate
Expand Down Expand Up @@ -3365,12 +3366,12 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process,

vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

pr_debug("pasid 0x%x mapping mmio page\n"
pr_debug("process pid %d mapping mmio page\n"
" target user address == 0x%08llX\n"
" physical address == 0x%08llX\n"
" vm_flags == 0x%04lX\n"
" size == 0x%04lX\n",
process->pasid, (unsigned long long) vma->vm_start,
process->lead_thread->pid, (unsigned long long) vma->vm_start,
address, vma->vm_flags, PAGE_SIZE);

return io_remap_pfn_range(vma,
Expand Down
14 changes: 7 additions & 7 deletions drivers/gpu/drm/amd/amdkfd/kfd_debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,12 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
size_t exception_data_size)
{
struct kfd_process *p;
struct kfd_process_device *pdd = NULL;
bool signaled_to_debugger_or_runtime = false;

p = kfd_lookup_process_by_pasid(pasid);
p = kfd_lookup_process_by_pasid(pasid, &pdd);

if (!p)
if (!pdd)
return false;

if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
Expand Down Expand Up @@ -238,9 +239,8 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,

mutex_unlock(&p->mutex);
} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
kfd_dqm_evict_pasid(dev->dqm, p->pasid);
kfd_signal_vm_fault_event(dev, p->pasid, NULL,
exception_data);
kfd_evict_process_device(pdd);
kfd_signal_vm_fault_event(pdd, NULL, exception_data);

signaled_to_debugger_or_runtime = true;
}
Expand Down Expand Up @@ -276,8 +276,8 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
data = (struct kfd_hsa_memory_exception_data *)
pdd->vm_fault_exc_data;

kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
kfd_evict_process_device(pdd);
kfd_signal_vm_fault_event(pdd, NULL, data);
error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
}

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -1565,7 +1565,7 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr
u32 cam_index;

if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) {
p = kfd_lookup_process_by_pasid(entry->pasid);
p = kfd_lookup_process_by_pasid(entry->pasid, NULL);
if (!p)
return true;

Expand Down
Loading

0 comments on commit 8544374

Please sign in to comment.