Skip to content

Commit

Permalink
drm/amdkfd: prepare per-process debug enable and disable
Browse files Browse the repository at this point in the history
The ROCm debugger will attach to a process to debug by PTRACE and will
expect the KFD to prepare a process for the target PID, whether the
target PID has opened the KFD device or not.

This patch is to explicitly handle this requirement.  Further HW mode
setting and runtime coordination requirements will be handled in
following patches.

In the case where the target process has not opened the KFD device,
a new KFD process must be created for the target PID.
The debugger as well as the target process for this case will not have
acquired any VMs so handle process restoration to correctly account for
this.

To coordinate with HSA runtime, the debugger must be aware of the target
process' runtime enablement status and will copy the runtime status
information into the debugged KFD process for later query.

On enablement, the debugger will subscribe to a set of exceptions where
each exception event will notify the debugger through a pollable FIFO
file descriptor that the debugger provides to the KFD to manage.

Finally on process termination of either the debugger or the target,
debugging must be disabled if that has not already been done.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Jonathan Kim authored and Alex Deucher committed Jun 9, 2023
1 parent d230f1b commit 0ab2d75
Show file tree
Hide file tree
Showing 7 changed files with 304 additions and 30 deletions.
3 changes: 2 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
$(AMDKFD_PATH)/kfd_crat.o \
$(AMDKFD_PATH)/kfd_debug.o

ifneq ($(CONFIG_AMD_IOMMU_V2),)
AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
Expand Down
102 changes: 100 additions & 2 deletions drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "amdgpu_amdkfd.h"
#include "kfd_smi_events.h"
#include "amdgpu_dma_buf.h"
#include "kfd_debug.h"

static long kfd_ioctl(struct file *, unsigned int, unsigned long);
static int kfd_open(struct inode *, struct file *);
Expand Down Expand Up @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
return -EPERM;
}

process = kfd_create_process(filep);
process = kfd_create_process(current);
if (IS_ERR(process))
return PTR_ERR(process);

if (kfd_process_init_cwsr_apu(process, filep)) {
kfd_unref_process(process);
return -EFAULT;
}

/* filep now owns the reference returned by kfd_create_process */
filep->private_data = process;

Expand Down Expand Up @@ -2737,16 +2743,92 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
{
struct kfd_ioctl_dbg_trap_args *args = data;
struct task_struct *thread = NULL;
struct mm_struct *mm = NULL;
struct pid *pid = NULL;
struct kfd_process *target = NULL;
int r = 0;

if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
pr_err("Debugging does not support sched_policy %i", sched_policy);
return -EINVAL;
}

pid = find_get_pid(args->pid);
if (!pid) {
pr_debug("Cannot find pid info for %i\n", args->pid);
r = -ESRCH;
goto out;
}

thread = get_pid_task(pid, PIDTYPE_PID);
if (!thread) {
r = -ESRCH;
goto out;
}

mm = get_task_mm(thread);
if (!mm) {
r = -ESRCH;
goto out;
}

if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
bool create_process;

rcu_read_lock();
create_process = thread && thread != current && ptrace_parent(thread) == current;
rcu_read_unlock();

target = create_process ? kfd_create_process(thread) :
kfd_lookup_process_by_pid(pid);
} else {
target = kfd_lookup_process_by_pid(pid);
}

if (!target) {
pr_debug("Cannot find process PID %i to debug\n", args->pid);
r = -ESRCH;
goto out;
}

/* Check if target is still PTRACED. */
rcu_read_lock();
if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
&& ptrace_parent(target->lead_thread) != current) {
pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
r = -EPERM;
}
rcu_read_unlock();

if (r)
goto out;

mutex_lock(&target->mutex);

if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
r = -EINVAL;
goto unlock_out;
}

switch (args->op) {
case KFD_IOC_DBG_TRAP_ENABLE:
if (target != p)
target->debugger_process = p;

r = kfd_dbg_trap_enable(target,
args->enable.dbg_fd,
(void __user *)args->enable.rinfo_ptr,
&args->enable.rinfo_size);
if (!r)
target->exception_enable_mask = args->enable.exception_mask;

pr_warn("Debug functions limited\n");
break;
case KFD_IOC_DBG_TRAP_DISABLE:
r = kfd_dbg_trap_disable(target);
break;
case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
Expand All @@ -2760,14 +2842,30 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
pr_warn("Debugging not supported yet\n");
pr_warn("Debug op %i not supported yet\n", args->op);
r = -EACCES;
break;
default:
pr_err("Invalid option: %i\n", args->op);
r = -EINVAL;
}

unlock_out:
mutex_unlock(&target->mutex);

out:
if (thread)
put_task_struct(thread);

if (mm)
mmput(mm);

if (pid)
put_pid(pid);

if (target)
kfd_unref_process(target);

return r;
}

Expand Down
80 changes: 80 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_debug.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

#include "kfd_debug.h"
#include <linux/file.h>

/*
 * kfd_dbg_trap_disable - tear down the debug session on @target, if any.
 * @target: KFD process whose debug session is being ended.
 *
 * When a session is active this releases the debugger's event file, detaches
 * the debugger process (decrementing its debugged_process_count), clears the
 * enabled flag and drops the process reference that kfd_dbg_trap_enable()
 * took for the session.  When no session is active it does nothing.
 *
 * NOTE(review): the ioctl path calls this with target->mutex held; confirm
 * that any other caller does the same.
 *
 * Return: always 0.
 */
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	struct kfd_process *debugger;

	if (target->debug_trap_enabled) {
		/* Release the file that was pinned when the session was enabled. */
		fput(target->dbg_ev_file);
		target->dbg_ev_file = NULL;

		debugger = target->debugger_process;
		if (debugger != NULL) {
			atomic_dec(&debugger->debugged_process_count);
			target->debugger_process = NULL;
		}

		target->debug_trap_enabled = false;

		/* Pairs with the kref_get() done for the debug session. */
		kfd_unref_process(target);
	}

	return 0;
}

/*
 * kfd_dbg_trap_enable - start a debug session on @target.
 * @target:       KFD process to be debugged.
 * @fd:           debugger-supplied file descriptor; a reference to its file
 *                is held in target->dbg_ev_file for the whole session.
 * @runtime_info: user buffer that receives a copy of target->runtime_info.
 * @runtime_size: in: size of the user buffer; out: sizeof(target->runtime_info),
 *                written whenever the file descriptor was valid (so also on
 *                -EFAULT), letting the caller learn the size it should supply.
 *
 * At most min(*runtime_size, sizeof(target->runtime_info)) bytes are copied
 * to user space.
 *
 * The user copy is done BEFORE any session state is committed.  A failing
 * copy therefore leaves @target untouched (no pinned file, no extra process
 * reference, debug_trap_enabled still false), so an -EFAULT return really
 * means "not enabled" and the caller can simply retry instead of hitting
 * -EALREADY on a half-enabled session.
 *
 * Return: 0 on success, -EALREADY if a session is already enabled, -EBADF if
 * @fd does not name an open file, -EFAULT if the runtime info could not be
 * copied to user space.
 */
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Never copy more than the caller's buffer or our structure holds. */
	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	/* Report the full structure size, even if the copy below fails. */
	*runtime_size = sizeof(target->runtime_info);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		/* Nothing has been committed yet; just drop the file reference. */
		fput(f);
		return -EFAULT;
	}

	target->dbg_ev_file = f;

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	return 0;
}
32 changes: 32 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_debug.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
#define KFD_DEBUG_EVENTS_H_INCLUDED

#include "kfd_priv.h"

/*
 * End the debug session on @target if one is enabled: releases the event
 * file, detaches the debugger process and drops the session's process
 * reference.  Does nothing when no session is enabled.  Always returns 0.
 */
int kfd_dbg_trap_disable(struct kfd_process *target);

/*
 * Start a debug session on @target.
 *
 * @fd is the debugger-supplied file descriptor whose file is held for the
 * session.  Up to *@runtime_info_size bytes of the target's runtime info are
 * copied to the user buffer @runtime_info, and *@runtime_info_size is updated
 * to the full size of that structure.
 *
 * Returns 0 on success, -EALREADY if debugging is already enabled on @target,
 * -EBADF if @fd is not a valid open file, or -EFAULT if the copy to user
 * space fails.
 */
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
void __user *runtime_info,
uint32_t *runtime_info_size);
#endif
26 changes: 20 additions & 6 deletions drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
goto out;

pdd = qpd_to_pdd(qpd);

/* The debugger creates processes that temporarily have not acquired
* all VMs for all devices and has no VMs itself.
* Skip queue eviction on process eviction.
*/
if (!pdd->drm_priv)
goto out;

pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
pdd->process->pasid);

Expand Down Expand Up @@ -1127,13 +1135,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
{
struct queue *q;
struct kfd_process_device *pdd;
uint64_t pd_base;
uint64_t eviction_duration;
int retval = 0;

pdd = qpd_to_pdd(qpd);
/* Retrieve PD base */
pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);

dqm_lock(dqm);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
Expand All @@ -1143,12 +1148,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
goto out;
}

/* The debugger creates processes that temporarily have not acquired
* all VMs for all devices and has no VMs itself.
* Skip queue restore on process restore.
*/
if (!pdd->drm_priv)
goto vm_not_acquired;

pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
pdd->process->pasid);

/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%llx\n", pd_base);
qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);

/* activate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
Expand All @@ -1171,9 +1183,11 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
qpd->evicted = 0;

eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
atomic64_add(eviction_duration, &pdd->evict_duration_counter);
vm_not_acquired:
qpd->evicted = 0;
out:
dqm_unlock(dqm);
return retval;
Expand Down
Loading

0 comments on commit 0ab2d75

Please sign in to comment.