Skip to content

Commit

Permalink
[PATCH] powerpc/pseries: clear PCI failure counter if no new failures
Browse files Browse the repository at this point in the history
The current PCI error recovery system keeps track of the number of PCI card
resets, and refuses to bring a card back up if this number is too large.
The goal of doing this was to avoid an infinite loop of resets if a card is
obviously dead.  However, if the failures are rare, but the machine has a
high uptime, this mechanism might still be triggered; this is too harsh.

This patch avoids this problem by decrementing the fail count after an
hour.  Thus, as long as a PCI card BSOD's fewer than 6 times an hour, it
will continue to be reset indefinitely.  If its failure rate is greater
than that, it will be taken off-line permanently.

This patch is larger than it might otherwise be because it changes
indentation by removing a pointless while-loop.  The while loop is not
needed, as the handler is invoked once for each event (by schedule_work());
the loop is leftover cruft from an earlier implementation.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
  • Loading branch information
Linas Vepstas authored and Paul Mackerras committed Apr 22, 2006
1 parent 4bd174f commit ac325ac
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 33 deletions.
13 changes: 7 additions & 6 deletions arch/powerpc/platforms/pseries/eeh_driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
*
*/
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/irq.h>
#include <linux/pci.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
Expand Down Expand Up @@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
*/
#define MAX_WAIT_FOR_RECOVERY 15

void handle_eeh_events (struct eeh_event *event)
struct pci_dn * handle_eeh_events (struct eeh_event *event)
{
struct device_node *frozen_dn;
struct pci_dn *frozen_pdn;
Expand All @@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event)
if (!frozen_dn) {
printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n",
pci_name(event->dev));
return;
return NULL;
}

/* There are two different styles for coming up with the PE.
Expand All @@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event)
if (!frozen_bus) {
printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n",
frozen_dn->full_name);
return;
return NULL;
}

#if 0
Expand Down Expand Up @@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event)
/* Tell all device drivers that they can resume operations */
pci_walk_bus(frozen_bus, eeh_report_resume, NULL);

return;
return frozen_pdn;

excess_failures:
/*
Expand Down Expand Up @@ -384,6 +383,8 @@ void handle_eeh_events (struct eeh_event *event)

/* Shut down the device drivers for good. */
pcibios_remove_pci_devices(frozen_bus);

return NULL;
}

/* ---------- end of file ---------- */
50 changes: 28 additions & 22 deletions arch/powerpc/platforms/pseries/eeh_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
*/

#include <linux/delay.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/pci.h>
Expand Down Expand Up @@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy)
{
unsigned long flags;
struct eeh_event *event;
struct pci_dn *pdn;

daemonize ("eehd");
set_current_state(TASK_INTERRUPTIBLE);

while (1) {
set_current_state(TASK_INTERRUPTIBLE);
spin_lock_irqsave(&eeh_eventlist_lock, flags);
event = NULL;

spin_lock_irqsave(&eeh_eventlist_lock, flags);
event = NULL;
/* Unqueue the event, get ready to process. */
if (!list_empty(&eeh_eventlist)) {
event = list_entry(eeh_eventlist.next, struct eeh_event, list);
list_del(&event->list);
}
spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

/* Unqueue the event, get ready to process. */
if (!list_empty(&eeh_eventlist)) {
event = list_entry(eeh_eventlist.next, struct eeh_event, list);
list_del(&event->list);
}
spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
if (event == NULL)
return 0;

if (event == NULL)
break;
/* Serialize processing of EEH events */
mutex_lock(&eeh_event_mutex);
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);

/* Serialize processing of EEH events */
mutex_lock(&eeh_event_mutex);
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
pci_name(event->dev));

printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
pci_name(event->dev));
pdn = handle_eeh_events(event);

handle_eeh_events(event);
eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
pci_dev_put(event->dev);
kfree(event);
mutex_unlock(&eeh_event_mutex);

eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
pci_dev_put(event->dev);
kfree(event);
mutex_unlock(&eeh_event_mutex);
/* If there are no new errors after an hour, clear the counter. */
if (pdn && pdn->eeh_freeze_count>0) {
msleep_interruptible (3600*1000);
if (pdn->eeh_freeze_count>0)
pdn->eeh_freeze_count--;
}

return 0;
Expand Down
10 changes: 5 additions & 5 deletions include/asm-powerpc/eeh_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
*/

#ifndef ASM_PPC64_EEH_EVENT_H
#define ASM_PPC64_EEH_EVENT_H
#ifndef ASM_POWERPC_EEH_EVENT_H
#define ASM_POWERPC_EEH_EVENT_H
#ifdef __KERNEL__

/** EEH event -- structure holding pci controller data that describes
Expand All @@ -39,7 +39,7 @@ struct eeh_event {
* @dev pci device
*
* This routine builds a PCI error event which will be delivered
* to all listeners on the peh_notifier_chain.
* to all listeners on the eeh_notifier_chain.
*
* This routine can be called within an interrupt context;
* the actual event will be delivered in a normal context
Expand All @@ -51,7 +51,7 @@ int eeh_send_failure_event (struct device_node *dn,
int time_unavail);

/* Main recovery function */
void handle_eeh_events (struct eeh_event *);
struct pci_dn * handle_eeh_events (struct eeh_event *);

#endif /* __KERNEL__ */
#endif /* ASM_PPC64_EEH_EVENT_H */
#endif /* ASM_POWERPC_EEH_EVENT_H */

0 comments on commit ac325ac

Please sign in to comment.