Skip to content

Commit

Permalink
drm/amdkfd: fix set kfd node ras properties value
Browse files Browse the repository at this point in the history
The ctx->features are new RAS implementation which
is only available for Vega20 and onwards, it is not
available for vega10, vega10 should follow legacy
ECC implementation.

Changed from V1:
    wrap function to initialize kfd node properties

Changed from V2:
    remove wrap function and SDMA SRAM ECC check

Change-Id: I1e3ff899bf066611fe5775e67104ce2e0bf8b7d0
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
  • Loading branch information
Stanley.Yang authored and Yang Xiong committed Aug 27, 2020
1 parent 31afcbf commit c56d0ef
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 22 deletions.
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,7 @@ struct amdgpu_device {

atomic_t throttling_logging_enabled;
struct ratelimit_state throttling_logging_rs;
uint32_t ras_features;
};

static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
Expand Down
28 changes: 19 additions & 9 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -1963,6 +1963,17 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
return 0;
}

static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
{
if (adev->asic_type != CHIP_VEGA10 &&
adev->asic_type != CHIP_VEGA20 &&
adev->asic_type != CHIP_ARCTURUS &&
adev->asic_type != CHIP_SIENNA_CICHLID)
return 1;
else
return 0;
}

/*
* check hardware's ras ability which will be saved in hw_supported.
* if hardware does not support ras, we can skip some ras initializtion and
Expand All @@ -1979,9 +1990,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
*supported = 0;

if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
(adev->asic_type != CHIP_VEGA20 &&
adev->asic_type != CHIP_ARCTURUS &&
adev->asic_type != CHIP_SIENNA_CICHLID))
amdgpu_ras_check_asic_type(adev))
return;

if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
Expand All @@ -2003,6 +2012,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,

*supported = amdgpu_ras_enable == 0 ?
0 : *hw_supported & amdgpu_ras_mask;
adev->ras_features = *supported;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
Expand All @@ -2025,9 +2035,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)

amdgpu_ras_check_supported(adev, &con->hw_supported,
&con->supported);
if (!con->hw_supported) {
if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
r = 0;
goto err_out;
goto release_con;
}

con->features = 0;
Expand All @@ -2038,25 +2048,25 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (adev->nbio.funcs->init_ras_controller_interrupt) {
r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
if (r)
goto err_out;
goto release_con;
}

if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
if (r)
goto err_out;
goto release_con;
}

if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto err_out;
goto release_con;
}

dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
"hardware ability[%x] ras_mask[%x]\n",
con->hw_supported, con->supported);
return 0;
err_out:
release_con:
amdgpu_ras_set_context(adev, NULL);
kfree(con);

Expand Down
24 changes: 11 additions & 13 deletions drivers/gpu/drm/amd/amdkfd/kfd_topology.c
Original file line number Diff line number Diff line change
Expand Up @@ -1314,7 +1314,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
void *crat_image = NULL;
size_t image_size = 0;
int proximity_domain;
struct amdgpu_ras *ctx;
struct amdgpu_device *adev;

INIT_LIST_HEAD(&temp_topology_device_list);

Expand Down Expand Up @@ -1488,19 +1488,17 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->node_props.max_waves_per_simd = 10;
}

ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd));
if (ctx) {
/* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC */
dev->node_props.capability |=
(((ctx->features & BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) ||
((ctx->features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ?
HSA_CAP_SRAM_EDCSUPPORTED : 0;
dev->node_props.capability |= ((ctx->features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
HSA_CAP_MEM_EDCSUPPORTED : 0;

dev->node_props.capability |= (ctx->features != 0) ?
adev = (struct amdgpu_device *)(dev->gpu->kgd);
/* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
dev->node_props.capability |=
((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
HSA_CAP_SRAM_EDCSUPPORTED : 0;
dev->node_props.capability |= ((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
HSA_CAP_MEM_EDCSUPPORTED : 0;

if (adev->asic_type != CHIP_VEGA10)
dev->node_props.capability |= (adev->ras_features != 0) ?
HSA_CAP_RASEVENTNOTIFY : 0;
}

kfd_debug_print_topology();

Expand Down

0 comments on commit c56d0ef

Please sign in to comment.