Skip to content

Commit

Permalink
drm/amdgpu: Introduce funcs for generating cper record
Browse files Browse the repository at this point in the history
Introduce new functions that generate CPER records for
uncorrectable (UE) and correctable (CE) errors.

v2: return -ENOMEM instead of false
v2: check return value of fill section function

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  • Loading branch information
Hawking Zhang authored and Alex Deucher committed Feb 17, 2025
1 parent 56316ee commit ad97840
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 13 deletions.
12 changes: 1 addition & 11 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,6 @@

typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);

/* A collection of ACA banks: a list head together with a counter. */
struct aca_banks {
/* presumably the number of entries currently linked on @list - confirm */
int nr_banks;
/* list head; entries are walked as struct aca_bank_node via its 'node' member */
struct list_head list;
};

/* One (hwid, mcatype) pair; element type of the aca_hwid_mcatypes[] table. */
struct aca_hwip {
/* presumably the hardware ID of the IP block - confirm */
int hwid;
/* presumably the MCA type within that hardware ID - confirm */
int mcatype;
};

static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
ACA_BANK_HWID(SMU, 0x01, 0x01),
ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00),
Expand Down Expand Up @@ -111,7 +101,7 @@ static struct aca_regs_dump {
{"STATUS", ACA_REG_IDX_STATUS},
{"ADDR", ACA_REG_IDX_ADDR},
{"MISC", ACA_REG_IDX_MISC0},
{"CONFIG", ACA_REG_IDX_CONFG},
{"CONFIG", ACA_REG_IDX_CONFIG},
{"IPID", ACA_REG_IDX_IPID},
{"SYND", ACA_REG_IDX_SYND},
{"DESTAT", ACA_REG_IDX_DESTAT},
Expand Down
12 changes: 11 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ enum aca_reg_idx {
ACA_REG_IDX_STATUS = 1,
ACA_REG_IDX_ADDR = 2,
ACA_REG_IDX_MISC0 = 3,
ACA_REG_IDX_CONFG = 4,
ACA_REG_IDX_CONFIG = 4,
ACA_REG_IDX_IPID = 5,
ACA_REG_IDX_SYND = 6,
ACA_REG_IDX_DESTAT = 8,
Expand Down Expand Up @@ -114,6 +114,11 @@ enum aca_smu_type {
ACA_SMU_TYPE_COUNT,
};

/* One (hwid, mcatype) pair describing an ACA hardware IP. */
struct aca_hwip {
/* presumably the hardware ID of the IP block - confirm */
int hwid;
/* presumably the MCA type within that hardware ID - confirm */
int mcatype;
};

struct aca_bank {
enum aca_error_type aca_err_type;
enum aca_smu_type smu_err_type;
Expand All @@ -125,6 +130,11 @@ struct aca_bank_node {
struct list_head node;
};

/* A collection of ACA banks: a list head together with a counter. */
struct aca_banks {
/* presumably the number of entries currently linked on @list - confirm */
int nr_banks;
/* list head; presumably links struct aca_bank_node entries via 'node' - confirm */
struct list_head list;
};

struct aca_bank_info {
int die_id;
int socket_id;
Expand Down
108 changes: 108 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/list.h>
#include <linux/slab.h>
#include "amdgpu.h"

static const guid_t MCE = CPER_NOTIFY_MCE;
Expand Down Expand Up @@ -257,6 +258,113 @@ struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
return hdr;
}

/**
 * amdgpu_cper_generate_ue_record - build a fatal CPER record for one ACA bank
 * @adev: amdgpu device the record is generated for
 * @bank: ACA bank whose STATUS, ADDR, IPID and SYND registers are captured
 *
 * Allocates a CPER entry with a single section, fills its header with
 * CPER_SEV_FATAL and stores the bank registers, split into 32-bit halves,
 * in the fatal (crashdump) section.
 *
 * Return: 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * error returned by amdgpu_cper_entry_fill_fatal_section().
 */
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
		return -ENOMEM;
	}

	/* Each 64-bit ACA register is stored as a lo/hi pair of 32-bit words */
	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret)
		goto out_free;

	/*
	 * TODO: commit the cper entry to cper ring (this must happen before
	 * the entry is released below).
	 */

out_free:
	/*
	 * The entry is owned by this function and nothing else holds a
	 * reference to it, so release it on both the success and error paths.
	 * NOTE(review): assumes amdgpu_cper_alloc_entry() returns a
	 * kmalloc-family buffer owned by the caller - confirm.
	 */
	kfree(fatal);
	return ret;
}

/*
 * Translate an ACA error type into the matching CPER severity:
 * CE -> non-fatal corrected, DEFERRED -> non-fatal uncorrected, UE -> fatal.
 * Any other value is logged as unknown and reported as fatal.
 */
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								 enum aca_error_type aca_err_type)
{
	if (aca_err_type == ACA_ERROR_TYPE_CE)
		return CPER_SEV_NON_FATAL_CORRECTED;

	if (aca_err_type == ACA_ERROR_TYPE_DEFERRED)
		return CPER_SEV_NON_FATAL_UNCORRECTED;

	/* Everything left is fatal; only a genuine UE is expected here */
	if (aca_err_type != ACA_ERROR_TYPE_UE)
		dev_err(adev->dev, "Unknown ACA error type!\n");

	return CPER_SEV_FATAL;
}

/**
 * amdgpu_cper_generate_ce_records - build one runtime CPER record for a bank list
 * @adev: amdgpu device the record is generated for
 * @banks: list of ACA banks to encode, one section per bank
 * @bank_count: number of sections to allocate in the CPER entry
 *
 * Allocates a runtime CPER entry with @bank_count sections. The header
 * severity is non-fatal corrected, raised to non-fatal uncorrected if any
 * bank on the list is a deferred error. Each bank's CTL, STATUS, ADDR, MISC0,
 * CONFIG, IPID and SYND registers are then stored, split into 32-bit halves,
 * in its own section with a per-bank severity.
 *
 * Banks beyond @bank_count are not encoded, since the entry was only sized
 * for @bank_count sections.
 *
 * Return: 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * error returned by amdgpu_cper_entry_fill_runtime_section().
 */
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret = 0;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE in cper record */
	list_for_each_entry(node, &banks->list, node) {
		/*
		 * The entry was allocated for bank_count sections only; never
		 * index a section past that, even if the list is longer.
		 */
		if (i >= bank_count) {
			dev_warn(adev->dev,
				 "more ACA banks than cper sections (%u), extra banks dropped\n",
				 bank_count);
			break;
		}

		bank = &node->bank;

		/* Each 64-bit ACA register is stored as a lo/hi pair of 32-bit words */
		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret)
			goto out_free;
	}

	/*
	 * TODO: commit the cper entry to cper ring (this must happen before
	 * the entry is released below).
	 */

out_free:
	/*
	 * The entry is owned by this function and nothing else holds a
	 * reference to it, so release it on both the success and error paths.
	 * NOTE(review): assumes amdgpu_cper_alloc_entry() returns a
	 * kmalloc-family buffer owned by the caller - confirm.
	 */
	kfree(corrected);
	return ret;
}

int amdgpu_cper_init(struct amdgpu_device *adev)
{
mutex_init(&adev->cper.cper_lock);
Expand Down
9 changes: 8 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#define __AMDGPU_CPER_H__

#include "amd_cper.h"
#include "amdgpu_aca.h"

#define CPER_MAX_ALLOWED_COUNT 0x1000
#define HDR_LEN (sizeof(struct cper_hdr))
Expand Down Expand Up @@ -84,7 +85,13 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
enum amdgpu_cper_type type,
uint16_t section_count);

/*
 * Generate a fatal CPER record from a single ACA bank.
 * Each UE must be encoded into a separate CPER entry (1 UE, 1 CPER).
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * error reported while filling the fatal section.
 */
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
struct aca_bank *bank);
/*
 * Generate one runtime CPER record from a list of ACA banks.
 * CEs and DEs are combined into a single CPER entry; bank_count is the
 * number of sections allocated for that entry.
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * error reported while filling a runtime section.
 */
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
struct aca_banks *banks,
uint16_t bank_count);
int amdgpu_cper_init(struct amdgpu_device *adev);
int amdgpu_cper_fini(struct amdgpu_device *adev);

Expand Down

0 comments on commit ad97840

Please sign in to comment.