Skip to content

Commit

Permalink
drm/amdgpu: Set EEPROM ras info
Browse files Browse the repository at this point in the history
Set EEPROM ras info: rma status, health percent and bad
page threshold.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Stanley.Yang authored and Alex Deucher committed Jun 9, 2023
1 parent 7c2551f commit 0bc3137
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 0 deletions.
24 changes: 24 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
u8 csum;
int res;
Expand All @@ -423,6 +424,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE;
rai->rma_status = GPU_HEALTH_USABLE;
/**
* GPU health represented as a percentage.
* 0 means worst health, 100 means fully health.
*/
rai->health_percent = 100;
/* ecc_page_threshold = 0 means disable bad page retirement */
rai->ecc_page_threshold = con->bad_page_cnt_threshold;
} else {
hdr->first_rec_offset = RAS_RECORD_START;
hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
Expand Down Expand Up @@ -712,6 +721,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
"Saved bad pages %d reaches threshold value %d\n",
control->ras_num_recs, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
control->tbl_rai.health_percent = 0;
}
}

if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
Expand Down Expand Up @@ -749,6 +762,17 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
goto Out;
}

/**
* bad page records have been stored in eeprom,
* now calculate gpu health percent
*/
if (amdgpu_bad_page_threshold != 0 &&
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
control->ras_num_recs < ras->bad_page_cnt_threshold)
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
control->ras_num_recs) * 100) /
ras->bad_page_cnt_threshold;

/* Recalc the checksum.
*/
csum = 0;
Expand Down
5 changes: 5 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@

struct amdgpu_device;

enum amdgpu_ras_gpu_health_status {
GPU_HEALTH_USABLE = 0,
GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
};

enum amdgpu_ras_eeprom_err_type {
AMDGPU_RAS_EEPROM_ERR_NA,
AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
Expand Down

0 comments on commit 0bc3137

Please sign in to comment.