drm/amdkfd: allow users to target recommended SDMA engines
Certain GPUs have better copy performance over xGMI on specific
SDMA engines depending on the source and destination GPU.
Allow users to create SDMA queues on these recommended engines.
Close to 2x overall performance has been observed with this
optimization.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Jonathan Kim authored and Alex Deucher committed Jul 25, 2024
1 parent 60c30ba commit e06b71b
Showing 7 changed files with 116 additions and 3 deletions.
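
Editor's note: for context, a minimal userspace sketch of how the new interface might be exercised. It is illustrative only, not part of this commit: the topology node/link indices are hypothetical, error handling is pared down, and a real SDMA queue additionally needs ring buffer and read/write pointer setup that is omitted here. The sketch reads the `recommended_sdma_engine_id_mask` property this patch adds to sysfs and passes one recommended engine to `AMDKFD_IOC_CREATE_QUEUE` with the new `KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID` type:

```c
/* Illustrative sketch only -- not part of this commit.
 * Topology paths/indices are hypothetical examples. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static uint32_t read_rec_sdma_mask(const char *props_path)
{
	char name[64];
	unsigned int val;
	uint32_t mask = 0;
	FILE *f = fopen(props_path, "r");

	if (!f)
		return 0;
	/* KFD topology properties files are "name value" pairs, one per line. */
	while (fscanf(f, "%63s %u", name, &val) == 2) {
		if (!strcmp(name, "recommended_sdma_engine_id_mask")) {
			mask = val;
			break;
		}
	}
	fclose(f);
	return mask;
}

static int create_sdma_queue_on_rec_engine(int kfd_fd, uint32_t gpu_id,
					   uint64_t ring_base, uint32_t ring_size)
{
	/* Hypothetical node/io_link indices; a real client walks the topology. */
	uint32_t mask = read_rec_sdma_mask(
		"/sys/class/kfd/kfd/topology/nodes/1/io_links/0/properties");
	struct kfd_ioctl_create_queue_args args = {0};

	if (!mask)
		return -1;

	args.gpu_id = gpu_id;
	args.queue_type = KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID;
	args.sdma_engine_id = __builtin_ctz(mask);	/* lowest recommended engine */
	args.ring_base_address = ring_base;		/* caller-allocated ring */
	args.ring_size = ring_size;
	args.queue_percentage = KFD_MAX_QUEUE_PERCENTAGE;
	args.queue_priority = 7;
	/* write/read pointer addresses etc. omitted for brevity. */

	return ioctl(kfd_fd, AMDKFD_IOC_CREATE_QUEUE, &args);
}
```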
16 changes: 16 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -255,13 +255,16 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 			args->ctx_save_restore_address;
 	q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
 	q_properties->ctl_stack_size = args->ctl_stack_size;
+	q_properties->sdma_engine_id = args->sdma_engine_id;
 	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
 	    args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
 		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
 	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA)
 		q_properties->type = KFD_QUEUE_TYPE_SDMA;
 	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
 		q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
+	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
+		q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
 	else
 		return -ENOTSUPP;
 
@@ -333,6 +336,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 		goto err_bind_process;
 	}
 
+	if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+		int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
+				      kfd_get_num_xgmi_sdma_engines(dev) - 1;
+
+		if (q_properties.sdma_engine_id > max_sdma_eng_id) {
+			err = -EINVAL;
+			pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
+			       q_properties.sdma_engine_id, max_sdma_eng_id);
+			goto err_sdma_engine_id;
+		}
+	}
+
 	if (!pdd->qpd.proc_doorbells) {
 		err = kfd_alloc_process_doorbells(dev->kfd, pdd);
 		if (err) {
@@ -387,6 +402,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 err_create_queue:
 	kfd_queue_release_buffers(pdd, &q_properties);
 err_acquire_queue_buf:
+err_sdma_engine_id:
 err_bind_process:
 err_pdd:
 	mutex_unlock(&p->mutex);
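
Editor's note: the bounds check above treats engine IDs as one global space, with the regular SDMA engines first and the xGMI engines after them. A tiny illustrative sketch (hypothetical engine counts, not from this commit):

```c
#include <stdio.h>

/* Illustrative only: mirrors the bounds check in kfd_ioctl_create_queue()
 * for a hypothetical part with 2 regular and 6 xGMI SDMA engines. */
int main(void)
{
	int num_sdma_engines = 2, num_xgmi_sdma_engines = 6;
	int max_sdma_eng_id = num_sdma_engines + num_xgmi_sdma_engines - 1;

	/* Engine IDs form one global space: 0-1 regular, 2-7 xGMI. */
	printf("valid sdma_engine_id range: 0..%d\n", max_sdma_eng_id);
	return 0;
}
```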
38 changes: 37 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1532,6 +1532,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 			q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
 		q->properties.sdma_queue_id = q->sdma_id /
 			kfd_get_num_xgmi_sdma_engines(dqm->dev);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+		int i, num_queues, num_engines, eng_offset = 0, start_engine;
+		bool free_bit_found = false, is_xgmi = false;
+
+		if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) {
+			num_queues = get_num_sdma_queues(dqm);
+			num_engines = kfd_get_num_sdma_engines(dqm->dev);
+			q->properties.type = KFD_QUEUE_TYPE_SDMA;
+		} else {
+			num_queues = get_num_xgmi_sdma_queues(dqm);
+			num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev);
+			eng_offset = kfd_get_num_sdma_engines(dqm->dev);
+			q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
+			is_xgmi = true;
+		}
+
+		/* Scan available bit based on target engine ID. */
+		start_engine = q->properties.sdma_engine_id - eng_offset;
+		for (i = start_engine; i < num_queues; i += num_engines) {
+
+			if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap))
+				continue;
+
+			clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap);
+			q->sdma_id = i;
+			q->properties.sdma_queue_id = q->sdma_id / num_engines;
+			free_bit_found = true;
+			break;
+		}
+
+		if (!free_bit_found) {
+			dev_err(dev, "No more SDMA queue to allocate for target ID %i\n",
+				q->properties.sdma_engine_id);
+			return -ENOMEM;
+		}
 	}
 
 	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
@@ -1784,7 +1819,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	}
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
-	    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI ||
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
 		dqm_lock(dqm);
 		retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL);
 		dqm_unlock(dqm);
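
Editor's note on the allocation logic above: queue slots are interleaved across engines, so bit i of the bitmap belongs to engine i % num_engines and is that engine's queue i / num_engines. The scan therefore starts at the target engine's first slot (after subtracting the xGMI engine offset) and strides by the engine count. A standalone sketch of the same indexing, simplified to a plain 64-bit mask rather than the kernel bitmap API:

```c
#include <stdint.h>

/* Illustrative sketch of the stride-by-engine scan in allocate_sdma_queue():
 * returns the first free slot owned by target_engine, or -1 if it is full. */
static int pick_sdma_slot(uint64_t free_mask, int num_queues,
			  int num_engines, int target_engine)
{
	int i;

	for (i = target_engine; i < num_queues; i += num_engines) {
		if (free_mask & (1ULL << i))
			return i;	/* engine = i % num_engines,
					 * queue  = i / num_engines */
	}
	return -1;
}
```

For example, with 8 engines and 64 slots, pick_sdma_slot(mask, 64, 8, 1) scans bits 1, 9, 17, and so on.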
5 changes: 4 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -414,13 +414,16 @@ enum kfd_unmap_queues_filter {
  * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
  *
  * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface.
+ *
+ * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID: SDMA user mode queue with target SDMA engine ID.
  */
 enum kfd_queue_type {
 	KFD_QUEUE_TYPE_COMPUTE,
 	KFD_QUEUE_TYPE_SDMA,
 	KFD_QUEUE_TYPE_HIQ,
 	KFD_QUEUE_TYPE_DIQ,
-	KFD_QUEUE_TYPE_SDMA_XGMI
+	KFD_QUEUE_TYPE_SDMA_XGMI,
+	KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
 };
 
 enum kfd_queue_format {
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -366,6 +366,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 	switch (type) {
 	case KFD_QUEUE_TYPE_SDMA:
 	case KFD_QUEUE_TYPE_SDMA_XGMI:
+	case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:
 		/* SDMA queues are always allocated statically no matter
 		 * which scheduler mode is used. We also do not need to
 		 * check whether a SDMA queue can be allocated here, because
52 changes: 52 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -292,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr,
 			      iolink->max_bandwidth);
 	sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
 			      iolink->rec_transfer_size);
+	sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask",
+			      iolink->rec_sdma_eng_id_mask);
 	sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);
 
 	return offs;
@@ -1265,6 +1267,55 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev,
 	}
 }
 
+#define REC_SDMA_NUM_GPU	8
+static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = {
+	{ -1, 14, 12,  2,  4,  8, 10,  6 },
+	{ 14, -1,  2, 10,  8,  4,  6, 12 },
+	{ 10,  2, -1, 12, 14,  6,  4,  8 },
+	{  2, 12, 10, -1,  6, 14,  8,  4 },
+	{  4,  8, 14,  6, -1, 10, 12,  2 },
+	{  8,  4,  6, 14, 12, -1,  2, 10 },
+	{ 10,  6,  4,  8, 12,  2, -1, 14 },
+	{  6, 12,  8,  4,  2, 10, 14, -1 }};
+
+static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
+					     struct kfd_iolink_properties *outbound_link,
+					     struct kfd_iolink_properties *inbound_link)
+{
+	struct kfd_node *gpu = outbound_link->gpu;
+	struct amdgpu_device *adev = gpu->adev;
+	int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
+	bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
+		adev->aid_mask && num_xgmi_nodes &&
+		(amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) ==
+			AMDGPU_SPX_PARTITION_MODE) &&
+		(!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);
+
+	if (support_rec_eng) {
+		int src_socket_id = adev->gmc.xgmi.physical_node_id;
+		int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;
+
+		outbound_link->rec_sdma_eng_id_mask =
+			1 << rec_sdma_eng_map[src_socket_id][dst_socket_id];
+		inbound_link->rec_sdma_eng_id_mask =
+			1 << rec_sdma_eng_map[dst_socket_id][src_socket_id];
+	} else {
+		int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
+		int i, eng_offset = 0;
+
+		if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
+		    kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
+			eng_offset = num_sdma_eng;
+			num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu);
+		}
+
+		for (i = 0; i < num_sdma_eng; i++) {
+			outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
+			inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
+		}
+	}
+}
+
 static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 {
 	struct kfd_iolink_properties *link, *inbound_link;
@@ -1303,6 +1354,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 				inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
 				kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link);
 				kfd_set_iolink_non_coherent(peer_dev, link, inbound_link);
+				kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link);
 			}
 		}
 
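
Editor's note, as a worked example of the mask computation above: with eight xGMI-connected sockets in SPX mode, traffic from socket 0 to socket 1 looks up rec_sdma_eng_map[0][1] == 14, so the outbound link advertises 1 << 14 (engine 14 only), while the inbound direction uses rec_sdma_eng_map[1][0]. On configurations that do not qualify, the else branch sets every engine of the link's type in the mask instead. Userspace can decode a mask with a loop like this (sketch, not from this commit):

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch: list the engine IDs encoded in a
 * recommended_sdma_engine_id_mask sysfs value. */
static void print_recommended_engines(uint32_t mask)
{
	int id;

	for (id = 0; mask; id++, mask >>= 1) {
		if (mask & 1)
			printf("recommended SDMA engine %d\n", id);
	}
}
```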
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -125,6 +125,7 @@ struct kfd_iolink_properties {
 	uint32_t min_bandwidth;
 	uint32_t max_bandwidth;
 	uint32_t rec_transfer_size;
+	uint32_t rec_sdma_eng_id_mask;
 	uint32_t flags;
 	struct kfd_node *gpu;
 	struct kobject *kobj;
6 changes: 5 additions & 1 deletion include/uapi/linux/kfd_ioctl.h
@@ -42,9 +42,10 @@
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
  * - 1.16 - Add contiguous VRAM allocation flag
+ * - 1.17 - Add SDMA queue creation with target SDMA engine ID
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 16
+#define KFD_IOCTL_MINOR_VERSION 17
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
@@ -56,6 +57,7 @@ struct kfd_ioctl_get_version_args {
 #define KFD_IOC_QUEUE_TYPE_SDMA			0x1
 #define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL		0x2
 #define KFD_IOC_QUEUE_TYPE_SDMA_XGMI		0x3
+#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID	0x4
 
 #define KFD_MAX_QUEUE_PERCENTAGE	100
 #define KFD_MAX_QUEUE_PRIORITY		15
@@ -78,6 +80,8 @@ struct kfd_ioctl_create_queue_args {
 	__u64 ctx_save_restore_address;	/* to KFD */
 	__u32 ctx_save_restore_size;	/* to KFD */
 	__u32 ctl_stack_size;		/* to KFD */
+	__u32 sdma_engine_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_destroy_queue_args {
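
Editor's note: since only KFD interface 1.17+ accepts the new queue type, a client would typically gate on the reported version first. A sketch using the get-version ioctl from this header:

```c
#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Sketch: true if the running KFD accepts KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID. */
static bool kfd_has_sdma_by_eng_id(int kfd_fd)
{
	struct kfd_ioctl_get_version_args ver = {0};

	if (ioctl(kfd_fd, AMDKFD_IOC_GET_VERSION, &ver))
		return false;
	return ver.major_version > 1 ||
	       (ver.major_version == 1 && ver.minor_version >= 17);
}
```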
