Skip to content

Commit

Permalink
drm/amdgpu: support gfx ras error injection and err_cnt query
Browse files Browse the repository at this point in the history
check gfx error count in both ras querry function and
ras interrupt handler.

gfx ras is still disabled by default due to known stability
issue found in gpu reset.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Dennis Li authored and Alex Deucher committed Jul 31, 2019
1 parent 2c960ea commit 83b0582
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 3 deletions.
19 changes: 16 additions & 3 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,10 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
if (adev->umc.funcs->query_ras_error_count)
adev->umc.funcs->query_ras_error_count(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.funcs->query_ras_error_count)
adev->gfx.funcs->query_ras_error_count(adev, &err_data);
break;
default:
break;
}
Expand Down Expand Up @@ -639,13 +643,22 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
if (!obj)
return -EINVAL;

if (block_info.block_id != TA_RAS_BLOCK__UMC) {
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.funcs->ras_error_inject)
ret = adev->gfx.funcs->ras_error_inject(adev, info);
else
ret = -EINVAL;
break;
case AMDGPU_RAS_BLOCK__UMC:
ret = psp_ras_trigger_error(&adev->psp, &block_info);
break;
default:
DRM_INFO("%s error injection is not supported yet\n",
ras_block_str(info->head.block));
return -EINVAL;
ret = -EINVAL;
}

ret = psp_ras_trigger_error(&adev->psp, &block_info);
if (ret)
DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
ras_block_str(info->head.block),
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
Original file line number Diff line number Diff line change
Expand Up @@ -5611,6 +5611,8 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
{
/* TODO ue will trigger an interrupt. */
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (adev->gfx.funcs->query_ras_error_count)
adev->gfx.funcs->query_ras_error_count(adev, err_data);
amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_UE;
}
Expand Down

0 comments on commit 83b0582

Please sign in to comment.