Skip to content

Commit

Permalink
powerpc/eeh: Defer printing stack trace
Browse files Browse the repository at this point in the history
Currently we print a stack trace in the event handler to help with
debugging EEH issues. In the case of surprise hot-unplug this is unneeded,
so we want to prevent printing the stack trace unless we know it's due to
an actual device error. To accomplish this, we can save a stack trace at
the point of detection and only print it once the EEH recovery handler has
determined the freeze was due to an actual error.

Since the whole point of this is to prevent spurious EEH output we also
move a few prints out of the detection thread, or mark them as pr_debug
so anyone interested can get output from eeh_dev_check_failure()
if they want.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-6-oohall@gmail.com
  • Loading branch information
Oliver O'Halloran authored and Michael Ellerman committed Sep 5, 2019
1 parent b104af5 commit 25baf3d
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 26 deletions.
11 changes: 11 additions & 0 deletions arch/powerpc/include/asm/eeh.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,17 @@ struct eeh_pe {
struct list_head child_list; /* List of PEs below this PE */
struct list_head child; /* Memb. child_list/eeh_phb_pe */
struct list_head edevs; /* List of eeh_dev in this PE */

/*
* Saved stack trace. When we find a PE freeze in eeh_dev_check_failure
* the stack trace is saved here so we can print it in the recovery
* thread if it turns out to be due to a real problem rather than
* a hot-remove.
*
* A max of 64 entries might be overkill, but it also might not be.
*/
unsigned long stack_trace[64];
int trace_entries;
};

#define eeh_pe_for_each_dev(pe, edev, tmp) \
Expand Down
15 changes: 4 additions & 11 deletions arch/powerpc/kernel/eeh.c
Original file line number Diff line number Diff line change
Expand Up @@ -420,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
eeh_pe_mark_isolated(phb_pe);
eeh_serialize_unlock(flags);

pr_err("EEH: PHB#%x failure detected, location: %s\n",
pr_debug("EEH: PHB#%x failure detected, location: %s\n",
phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
dump_stack();
eeh_send_failure_event(phb_pe);

return 1;
out:
eeh_serialize_unlock(flags);
Expand All @@ -451,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
unsigned long flags;
struct device_node *dn;
struct pci_dev *dev;
struct eeh_pe *pe, *parent_pe, *phb_pe;
struct eeh_pe *pe, *parent_pe;
int rc = 0;
const char *location = NULL;

Expand Down Expand Up @@ -581,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
* a stack trace will help the device-driver authors figure
* out what happened. So print that out.
*/
phb_pe = eeh_phb_pe_get(pe->phb);
pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
pe->phb->global_number, pe->addr);
pr_err("EEH: PE location: %s, PHB location: %s\n",
eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
dump_stack();

pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
__func__, pe->phb->global_number, pe->addr);
eeh_send_failure_event(pe);

return 1;
Expand Down
38 changes: 37 additions & 1 deletion arch/powerpc/kernel/eeh_driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -863,8 +863,44 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
if (eeh_slot_presence_check(edev->pdev))
devices++;

if (!devices)
if (!devices) {
pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
pe->phb->global_number, pe->addr);
goto out; /* nothing to recover */
}

/* Log the event */
if (pe->type & EEH_PE_PHB) {
pr_err("EEH: PHB#%x failure detected, location: %s\n",
pe->phb->global_number, eeh_pe_loc_get(pe));
} else {
struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);

pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
pe->phb->global_number, pe->addr);
pr_err("EEH: PE location: %s, PHB location: %s\n",
eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
}

/*
* Print the saved stack trace now that we've verified there's
* something to recover.
*/
if (pe->trace_entries) {
void **ptrs = (void **) pe->stack_trace;
int i;

pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
pe->phb->global_number, pe->addr);

/* FIXME: Use the same format as dump_stack() */
pr_err("EEH: Call Trace:\n");
for (i = 0; i < pe->trace_entries; i++)
pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);

pe->trace_entries = 0;
}


eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
Expand Down
26 changes: 12 additions & 14 deletions arch/powerpc/kernel/eeh_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy)
{
unsigned long flags;
struct eeh_event *event;
struct eeh_pe *pe;

while (!kthread_should_stop()) {
if (wait_for_completion_interruptible(&eeh_eventlist_event))
Expand All @@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy)
continue;

/* We might have event without binding PE */
pe = event->pe;
if (pe) {
if (pe->type & EEH_PE_PHB)
pr_info("EEH: Detected error on PHB#%x\n",
pe->phb->global_number);
else
pr_info("EEH: Detected PCI bus error on "
"PHB#%x-PE#%x\n",
pe->phb->global_number, pe->addr);
eeh_handle_normal_event(pe);
} else {
if (event->pe)
eeh_handle_normal_event(event->pe);
else
eeh_handle_special_event();
}

kfree(event);
}
Expand Down Expand Up @@ -126,8 +116,16 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
* This prevents the PE from being free()ed by a hotplug driver
* while the PE is sitting in the event queue.
*/
if (pe)
if (pe) {
/*
* Save the current stack trace so we can dump it from the
* event handler thread.
*/
pe->trace_entries = stack_trace_save(pe->stack_trace,
ARRAY_SIZE(pe->stack_trace), 0);

eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
}

/* We may or may not be called in an interrupt context */
spin_lock_irqsave(&eeh_eventlist_lock, flags);
Expand Down

0 comments on commit 25baf3d

Please sign in to comment.