Skip to content

Commit

Permalink
drm/amdgpu: message smu to update bad channel info
Browse files Browse the repository at this point in the history
The driver should notify the SMU to update bad channel info when an
uncorrectable error is detected in the UMC block.

Change-Id: I2dc8848affdb53e52891013953ae9383fff5f20f
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
  • Loading branch information
Stanley.Yang authored and Stanley.Yang committed Mar 21, 2022
1 parent 9338d13 commit 2dd6a04
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 2 deletions.
7 changes: 7 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -2066,6 +2066,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;

max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
Expand All @@ -2090,6 +2091,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
goto free;

amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
con->update_channel_flag = false;
}
}

#ifdef HAVE_SMCA_UMC_V2
Expand Down Expand Up @@ -2284,6 +2290,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}

con->update_channel_flag = false;
con->features = 0;
INIT_LIST_HEAD(&con->head);
/* Might need get this flag from vbios. */
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,9 @@ struct amdgpu_ras {

/* record umc error info queried from smu */
struct umc_ecc_info umc_ecc;

/* Indicates whether updated bad channel info still needs to be sent to the SMU */
bool update_channel_flag;
};

struct ras_fs_data {
Expand Down
25 changes: 23 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
u8 csum;
int res;

Expand All @@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)

amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);

control->bad_channel_bitmap = 0;
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
con->update_channel_flag = false;

amdgpu_ras_debugfs_set_ret_size(control);

mutex_unlock(&control->ras_tbl_mutex);
Expand Down Expand Up @@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *record,
const u32 num)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
u32 a, b, i;
u8 *buf, *pp;
int res;
Expand All @@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
/* Encode all of them in one go.
*/
pp = buf;
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__encode_table_record_to_buf(control, &record[i], pp);

/* update bad channel bitmap */
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
}

/* a, first record index to write into.
* b, last record index to write into.
* a = first index to read (fri) + number of records in the table,
Expand Down Expand Up @@ -684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int i, res;
u8 *buf, *pp;
u32 g0, g1;
Expand Down Expand Up @@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
/* Read up everything? Then transform.
*/
pp = buf;
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__decode_table_record_from_buf(control, &record[i], pp);

/* update bad channel bitmap */
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
}
Out:
kfree(buf);
mutex_unlock(&control->ras_tbl_mutex);
Expand Down
4 changes: 4 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
/* Protect table access via this mutex.
*/
struct mutex ras_tbl_mutex;

/* Bitmap of the memory channels on which bad pages have been recorded.
*/
u32 bad_channel_bitmap;
};

/*
Expand Down
5 changes: 5 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_ras_save_bad_pages(adev);

amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
con->update_channel_flag = false;
}
}

if (reset)
Expand Down

0 comments on commit 2dd6a04

Please sign in to comment.