Skip to content

Commit

Permalink
drm/amdgpu: Fix SDMA RAS error reporting on Aldebaran
Browse files Browse the repository at this point in the history
Fix the following issues with SDMA RAS error reporting:
1. Read the EDC_COUNTER2 register also to fetch error counts
   for all sub-blocks in SDMA.
2. SDMA RAS on Aldebaran supports single-bit uncorrectable errors
   only. So, report error count in UE count instead of CE count.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-By: John Clements <John.Clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Mukul Joshi authored and Alex Deucher committed Apr 21, 2021
1 parent 1f0d8e3 commit ceb47e0
Showing 1 changed file with 28 additions and 7 deletions.
35 changes: 28 additions & 7 deletions drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = {
};

static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
uint32_t reg_offset,
uint32_t value,
uint32_t instance,
uint32_t *sec_count)
Expand All @@ -169,6 +170,9 @@ static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,

/* double bits error (multiple bits) error detection is not supported */
for (i = 0; i < ARRAY_SIZE(sdma_v4_4_ras_fields); i++) {
if (sdma_v4_4_ras_fields[i].reg_offset != reg_offset)
continue;

/* the SDMA_EDC_COUNTER register in each sdma instance
* shares the same sed shift_mask
* */
Expand Down Expand Up @@ -197,13 +201,30 @@ static int sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev,
reg_value = RREG32(reg_offset);
/* double bit error is not supported */
if (reg_value)
sdma_v4_4_get_ras_error_count(adev, reg_value, instance, &sec_count);
/* err_data->ce_count should be initialized to 0
* before calling into this function */
err_data->ce_count += sec_count;
/* double bit error is not supported
* set ue count to 0 */
err_data->ue_count = 0;
sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, reg_value,
instance, &sec_count);

reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2);
reg_value = RREG32(reg_offset);
/* double bit error is not supported */
if (reg_value)
sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, reg_value,
instance, &sec_count);

/*
* err_data->ue_count should be initialized to 0
* before calling into this function
*
* SDMA RAS supports single bit uncorrectable error detection.
* So, increment uncorrectable error count.
*/
err_data->ue_count += sec_count;

/*
* SDMA RAS does not support correctable errors.
* Set ce count to 0.
*/
err_data->ce_count = 0;

return 0;
};
Expand Down

0 comments on commit ceb47e0

Please sign in to comment.