Skip to content

Commit

Permalink
drm/amdkfd: support per-queue reset on gfx9
Browse files Browse the repository at this point in the history
Support per-queue reset for GFX9.  The recommendation is for the driver
to perform a targeted reset of the HW queue via a SPI MMIO register write.

Since this requires pipe and HW queue info and MEC FW is limited to
doorbell reports of hung queues after an unmap failure, scan the HW
queue slots defined by SET_RESOURCES first to identify the user queue
candidates to reset.

Only signal reset events to processes that have had a queue reset.

If queue reset fails, fall back to GPU reset.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Jonathan Kim authored and Alex Deucher committed Aug 6, 2024
1 parent e89d2fe commit ee0a469
Show file tree
Hide file tree
Showing 16 changed files with 373 additions and 13 deletions.
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,4 +191,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset,
};
4 changes: 3 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
Original file line number Diff line number Diff line change
Expand Up @@ -418,5 +418,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset
};
4 changes: 3 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
Original file line number Diff line number Diff line change
Expand Up @@ -541,5 +541,7 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
kgd_gfx_v9_4_3_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
.set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
.clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch
.clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset
};
16 changes: 16 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,20 @@ static void program_trap_handler_settings(struct amdgpu_device *adev,
unlock_srbm(adev);
}

/*
 * kgd_gfx_v10_hqd_get_pq_addr - GFX10 placeholder for the hqd_get_pq_addr hook.
 * @adev:     amdgpu device (unused)
 * @pipe_id:  pipe of the HW queue slot (unused)
 * @queue_id: queue index within the pipe (unused)
 * @inst:     instance index (unused)
 *
 * No registers are touched here; the function unconditionally reports 0.
 *
 * Return: always 0.
 */
uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev,
				     uint32_t pipe_id,
				     uint32_t queue_id,
				     uint32_t inst)
{
	/* Stub: nothing is looked up, so there is never an address to report. */
	return 0;
}

/*
 * kgd_gfx_v10_hqd_reset - GFX10 placeholder for the hqd_reset hook.
 * @adev:     amdgpu device (unused)
 * @pipe_id:  pipe of the HW queue slot (unused)
 * @queue_id: queue index within the pipe (unused)
 * @inst:     instance index (unused)
 * @utimeout: timeout argument (unused)
 *
 * No reset is attempted and no registers are touched.
 *
 * Return: always 0.
 */
uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
			       uint32_t pipe_id,
			       uint32_t queue_id,
			       uint32_t inst,
			       unsigned int utimeout)
{
	/* Stub: no queue is ever reset, so no queue address is returned. */
	return 0;
}

const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.program_sh_mem_settings = kgd_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
Expand Down Expand Up @@ -1097,4 +1111,6 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
.program_trap_handler_settings = program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v10_hqd_reset
};
9 changes: 9 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,12 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
uint32_t grace_period,
uint32_t *reg_offset,
uint32_t *reg_data);
/*
 * GFX10 hqd_get_pq_addr hook, identified by pipe, queue and instance.
 * NOTE(review): the definition visible in amdgpu_amdkfd_gfx_v10.c is a stub
 * that always returns 0 - confirm before relying on a real address here.
 */
uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev,
uint32_t pipe_id,
uint32_t queue_id,
uint32_t inst);
/*
 * GFX10 hqd_reset hook, identified by pipe, queue and instance, with a
 * caller-supplied timeout.
 * NOTE(review): the definition visible in amdgpu_amdkfd_gfx_v10.c is a stub
 * that always returns 0 and performs no reset - confirm before relying on it.
 */
uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
uint32_t pipe_id,
uint32_t queue_id,
uint32_t inst,
unsigned int utimeout);
4 changes: 3 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
Original file line number Diff line number Diff line change
Expand Up @@ -680,5 +680,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
.set_address_watch = kgd_gfx_v10_set_address_watch,
.clear_address_watch = kgd_gfx_v10_clear_address_watch
.clear_address_watch = kgd_gfx_v10_clear_address_watch,
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v10_hqd_reset
};
18 changes: 17 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
Original file line number Diff line number Diff line change
Expand Up @@ -786,6 +786,20 @@ static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev,
return 0;
}

/*
 * kgd_gfx_v11_hqd_get_pq_addr - GFX11 placeholder for the hqd_get_pq_addr hook.
 * @adev:     amdgpu device (unused)
 * @pipe_id:  pipe of the HW queue slot (unused)
 * @queue_id: queue index within the pipe (unused)
 * @inst:     instance index (unused)
 *
 * No registers are touched here; the function unconditionally reports 0.
 *
 * Return: always 0.
 */
static uint64_t kgd_gfx_v11_hqd_get_pq_addr(struct amdgpu_device *adev,
					    uint32_t pipe_id,
					    uint32_t queue_id,
					    uint32_t inst)
{
	/* Stub: nothing is looked up, so there is never an address to report. */
	return 0;
}

/*
 * kgd_gfx_v11_hqd_reset - GFX11 placeholder for the hqd_reset hook.
 * @adev:     amdgpu device (unused)
 * @pipe_id:  pipe of the HW queue slot (unused)
 * @queue_id: queue index within the pipe (unused)
 * @inst:     instance index (unused)
 * @utimeout: timeout argument (unused)
 *
 * No reset is attempted and no registers are touched.
 *
 * Return: always 0.
 */
static uint64_t kgd_gfx_v11_hqd_reset(struct amdgpu_device *adev,
				      uint32_t pipe_id,
				      uint32_t queue_id,
				      uint32_t inst,
				      unsigned int utimeout)
{
	/* Stub: no queue is ever reset, so no queue address is returned. */
	return 0;
}

const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.program_sh_mem_settings = program_sh_mem_settings_v11,
.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
Expand All @@ -808,5 +822,7 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode,
.set_address_watch = kgd_gfx_v11_set_address_watch,
.clear_address_watch = kgd_gfx_v11_clear_address_watch
.clear_address_watch = kgd_gfx_v11_clear_address_watch,
.hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v11_hqd_reset
};
85 changes: 85 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,89 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
kgd_gfx_v9_unlock_srbm(adev, inst);
}

/*
 * kgd_gfx_v9_hqd_get_pq_addr - read back the PQ base address of a HW queue slot.
 * @adev:     amdgpu device
 * @pipe_id:  pipe of the HW queue slot to inspect
 * @queue_id: queue index within the pipe
 * @inst:     instance index, translated with GET_INST(GC, inst)
 *
 * With the queue acquired and RLC safe mode entered, CP_HQD_ACTIVE is read;
 * only if it is non-zero are CP_HQD_PQ_BASE and CP_HQD_PQ_BASE_HI read.
 * Safe mode and the queue are released on every path.
 *
 * Return: ((PQ_BASE_HI << 32) | PQ_BASE) << 8, or 0 when CP_HQD_ACTIVE reads
 * zero or PQ_BASE_HI is zero.
 */
uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
				    uint32_t pipe_id, uint32_t queue_id,
				    uint32_t inst)
{
	uint64_t pq_addr = 0;
	uint32_t base_lo, base_hi;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
	amdgpu_gfx_rlc_enter_safe_mode(adev, inst);

	if (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE)) {
		base_lo = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE);
		base_hi = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI);

		/*
		 * Only user queues are of interest; a zero upper dword is
		 * treated as "not a user queue" and leaves the result at 0.
		 */
		if (base_hi)
			pq_addr = (((uint64_t)base_hi << 32) | base_lo) << 8;
	}

	amdgpu_gfx_rlc_exit_safe_mode(adev, inst);
	kgd_gfx_v9_release_queue(adev, inst);

	return pq_addr;
}

/*
 * kgd_gfx_v9_hqd_reset - reset a single active user HW queue via SPI.
 * @adev:     amdgpu device
 * @pipe_id:  pipe of the HW queue slot to reset
 * @queue_id: queue index within the pipe
 * @inst:     instance index, translated with GET_INST(GC, inst)
 * @utimeout: how long to wait for the queue to go inactive, in milliseconds
 *
 * With the queue acquired and RLC safe mode entered, the slot is skipped
 * unless CP_HQD_ACTIVE is non-zero and CP_HQD_PQ_BASE_HI is non-zero.
 * Otherwise SPI_COMPUTE_QUEUE_RESET is written and CP_HQD_ACTIVE is polled
 * until its ACTIVE bit clears or @utimeout elapses. Safe mode and the queue
 * are released on every path.
 *
 * Return: ((PQ_BASE_HI << 32) | PQ_BASE) << 8 of the queue that was reset,
 * or 0 if the slot was skipped or the queue was still active at timeout.
 */
uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
			      uint32_t pipe_id, uint32_t queue_id,
			      uint32_t inst, unsigned int utimeout)
{
	uint32_t low, high, temp;
	unsigned long end_jiffies;
	uint64_t queue_addr = 0;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
	amdgpu_gfx_rlc_enter_safe_mode(adev, inst);

	if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE))
		goto unlock_out;

	low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE);
	high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI);

	/* only concerned with user queues. */
	if (!high)
		goto unlock_out;

	queue_addr = (((uint64_t)high << 32) | low) << 8;

	pr_debug("Attempting queue reset on XCC %u pipe id %u queue id %u\n",
		 inst, pipe_id, queue_id);

	/* assume previous dequeue request issued will take effect after reset */
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1);

	/*
	 * Widen before multiplying: utimeout * HZ in plain unsigned int
	 * arithmetic can wrap for large timeouts before it is stored.
	 */
	end_jiffies = ((unsigned long)utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);

		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;

		/* Still active past the deadline: report failure with 0. */
		if (time_after(jiffies, end_jiffies)) {
			queue_addr = 0;
			break;
		}

		usleep_range(500, 1000);
	}

	pr_debug("queue reset on XCC %u pipe id %u queue id %u %s\n",
		 inst, pipe_id, queue_id, queue_addr ? "succeeded!" : "failed!");

unlock_out:
	amdgpu_gfx_rlc_exit_safe_mode(adev, inst);
	kgd_gfx_v9_release_queue(adev, inst);

	return queue_addr;
}

const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
Expand Down Expand Up @@ -1172,4 +1255,6 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset
};
9 changes: 9 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,12 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
uint32_t grace_period,
uint32_t *reg_offset,
uint32_t *reg_data);
/*
 * Read back the PQ base address of the HW queue slot at (pipe_id, queue_id)
 * on instance inst. Per the definition in amdgpu_amdkfd_gfx_v9.c, returns 0
 * when the slot is not active or the upper base dword is zero.
 */
uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
uint32_t pipe_id,
uint32_t queue_id,
uint32_t inst);
/*
 * Reset the HW queue slot at (pipe_id, queue_id) on instance inst, waiting up
 * to utimeout milliseconds for it to go inactive. Per the definition in
 * amdgpu_amdkfd_gfx_v9.c, returns the queue's PQ base address on success and
 * 0 if the slot was skipped or the reset timed out.
 */
uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
uint32_t pipe_id,
uint32_t queue_id,
uint32_t inst,
unsigned int utimeout);
Loading

0 comments on commit ee0a469

Please sign in to comment.