Skip to content

Commit

Permalink
drm/amdgpu: save UMC global channel index to eeprom
Browse files Browse the repository at this point in the history
Save the global channel index returned by the RAS TA to the EEPROM.
We can get the memory physical address from the MCA address and the channel index.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Tao Zhou authored and Alex Deucher committed Dec 10, 2024
1 parent 07dd49e commit 71a0e96
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 11 deletions.
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,8 @@ struct ras_ecc_err {
uint64_t ipid;
uint64_t addr;
uint64_t pa_pfn;
/* save global channel index across all UMC instances */
uint32_t channel_idx;
struct ras_err_pages err_pages;
};

Expand Down
7 changes: 2 additions & 5 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,9 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
uint64_t err_addr, uint32_t ch, uint32_t umc,
uint32_t node, uint32_t socket,
uint64_t *addr, bool dump_addr)
struct ta_ras_query_address_output *addr_out, bool dump_addr)
{
struct ta_ras_query_address_input addr_in;
struct ta_ras_query_address_output addr_out;
int ret;

memset(&addr_in, 0, sizeof(addr_in));
Expand All @@ -510,14 +509,12 @@ int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,

if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in,
&addr_out, dump_addr);
addr_out, dump_addr);
if (ret)
return ret;
} else {
return 0;
}

*addr = addr_out.pa.pa;

return 0;
}
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,5 +146,5 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
uint64_t err_addr, uint32_t ch, uint32_t umc,
uint32_t node, uint32_t socket,
uint64_t *addr, bool dump_addr);
struct ta_ras_query_address_output *addr_out, bool dump_addr);
#endif
13 changes: 8 additions & 5 deletions drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
bool dump_addr)
{
uint32_t col, col_lower, row, row_lower, bank;
uint32_t channel_index, umc_inst = 0;
uint32_t channel_index = 0, umc_inst = 0;
uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS];
uint64_t soc_pa, column, err_addr;
struct ta_ras_query_address_output addr_out_tmp;
Expand All @@ -193,7 +193,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
else
paddr_out = addr_out;

err_addr = bank = channel_index = 0;
err_addr = bank = 0;
if (addr_in) {
err_addr = addr_in->ma.err_addr;
addr_in->addr_type = TA_RAS_MCA_TO_PA;
Expand All @@ -206,7 +206,6 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
}

bank = paddr_out->pa.bank;
channel_index = paddr_out->pa.channel_idx;
/* no need to care about umc inst if addr_in is NULL */
umc_inst = addr_in->ma.umc_inst;
}
Expand All @@ -228,6 +227,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
}

soc_pa = paddr_out->pa.pa;
channel_index = paddr_out->pa.channel_idx;
/* clear loop bits in soc physical address */
for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++)
soc_pa &= ~BIT_ULL(loop_bits[i]);
Expand Down Expand Up @@ -466,6 +466,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
uint64_t err_addr, pa_addr = 0;
struct ras_ecc_err *ecc_err;
struct ta_ras_query_address_output addr_out;
int count, ret, i;

hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
Expand Down Expand Up @@ -495,18 +496,20 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_umc_mca_to_addr(adev,
err_addr, MCA_IPID_2_UMC_CH(ipid),
MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid),
MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true);
MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true);
if (ret)
return ret;

ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
if (!ecc_err)
return -ENOMEM;

pa_addr = addr_out.pa.pa;
ecc_err->status = status;
ecc_err->ipid = ipid;
ecc_err->addr = addr;
ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;
ecc_err->channel_idx = addr_out.pa.channel_idx;

/* If converted pa_pfn is 0, use pa C4 pfn. */
if (!ecc_err->pa_pfn)
Expand Down Expand Up @@ -577,7 +580,7 @@ static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
ret = amdgpu_umc_fill_error_record(err_data,
ecc_err->addr,
page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
MCA_IPID_2_UMC_CH(ecc_err->ipid),
ecc_err->channel_idx,
MCA_IPID_2_UMC_INST(ecc_err->ipid));
if (ret)
break;
Expand Down

0 comments on commit 71a0e96

Please sign in to comment.