Skip to content

Commit

Permalink
powerpc/powernv: Fix machine check reporting of async store errors
Browse files Browse the repository at this point in the history
POWER9 and POWER10 asynchronous machine checks due to stores have their
cause reported in SRR1 but SRR1[42] is set, which in other cases
indicates DSISR cause.

Check for these cases and clear SRR1[42], so the cause matching uses
the i-side (SRR1) table.

Fixes: 7b9f71f ("powerpc/64s: POWER9 machine check handler")
Fixes: 201220b ("powerpc/powernv: Machine check handler for POWER10")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210517140355.2325406-1-npiggin@gmail.com
  • Loading branch information
Nicholas Piggin authored and Michael Ellerman committed Jun 21, 2021
1 parent 99cd49b commit 3729e0e
Showing 1 changed file with 40 additions and 8 deletions.
48 changes: 40 additions & 8 deletions arch/powerpc/kernel/mce_power.c
Original file line number Diff line number Diff line change
Expand Up @@ -481,12 +481,11 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
return -1;
}

static int mce_handle_ierror(struct pt_regs *regs,
static int mce_handle_ierror(struct pt_regs *regs, unsigned long srr1,
const struct mce_ierror_table table[],
struct mce_error_info *mce_err, uint64_t *addr,
uint64_t *phys_addr)
{
uint64_t srr1 = regs->msr;
int handled = 0;
int i;

Expand Down Expand Up @@ -695,19 +694,19 @@ static long mce_handle_ue_error(struct pt_regs *regs,
}

static long mce_handle_error(struct pt_regs *regs,
unsigned long srr1,
const struct mce_derror_table dtable[],
const struct mce_ierror_table itable[])
{
struct mce_error_info mce_err = { 0 };
uint64_t addr, phys_addr = ULONG_MAX;
uint64_t srr1 = regs->msr;
long handled;

if (SRR1_MC_LOADSTORE(srr1))
handled = mce_handle_derror(regs, dtable, &mce_err, &addr,
&phys_addr);
else
handled = mce_handle_ierror(regs, itable, &mce_err, &addr,
handled = mce_handle_ierror(regs, srr1, itable, &mce_err, &addr,
&phys_addr);

if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
Expand All @@ -723,16 +722,20 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs)
/* P7 DD1 leaves top bits of DSISR undefined */
regs->dsisr &= 0x0000ffff;

return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table);
return mce_handle_error(regs, regs->msr,
mce_p7_derror_table, mce_p7_ierror_table);
}

long __machine_check_early_realmode_p8(struct pt_regs *regs)
{
return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table);
return mce_handle_error(regs, regs->msr,
mce_p8_derror_table, mce_p8_ierror_table);
}

long __machine_check_early_realmode_p9(struct pt_regs *regs)
{
unsigned long srr1 = regs->msr;

/*
* On POWER9 DD2.1 and below, it's possible to get a machine check
* caused by a paste instruction where only DSISR bit 25 is set. This
Expand All @@ -746,10 +749,39 @@ long __machine_check_early_realmode_p9(struct pt_regs *regs)
if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000)
return 1;

return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
/*
* Async machine check due to bad real address from store or foreign
* link time out comes with the load/store bit (PPC bit 42) set in
* SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're
* directed to the ierror table so it will find the cause (which
* describes it correctly as a store error).
*/
if (SRR1_MC_LOADSTORE(srr1) &&
((srr1 & 0x081c0000) == 0x08140000 ||
(srr1 & 0x081c0000) == 0x08180000)) {
srr1 &= ~PPC_BIT(42);
}

return mce_handle_error(regs, srr1,
mce_p9_derror_table, mce_p9_ierror_table);
}

long __machine_check_early_realmode_p10(struct pt_regs *regs)
{
return mce_handle_error(regs, mce_p10_derror_table, mce_p10_ierror_table);
unsigned long srr1 = regs->msr;

/*
* Async machine check due to bad real address from store comes with
* the load/store bit (PPC bit 42) set in SRR1, but the cause comes in
* SRR1 not DSISR. Clear bit 42 so we're directed to the ierror table
* so it will find the cause (which describes it correctly as a store
* error).
*/
if (SRR1_MC_LOADSTORE(srr1) &&
(srr1 & 0x081c0000) == 0x08140000) {
srr1 &= ~PPC_BIT(42);
}

return mce_handle_error(regs, srr1,
mce_p10_derror_table, mce_p10_ierror_table);
}

0 comments on commit 3729e0e

Please sign in to comment.