Skip to content

Commit

Permalink
---
Browse files Browse the repository at this point in the history
yaml
---
r: 13708
b: refs/heads/master
c: fd761fd
h: refs/heads/master
v: v3
  • Loading branch information
Linas Vepstas authored and Paul Mackerras committed Nov 10, 2005
1 parent 1d5408a commit b9d689b
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 14 deletions.
2 changes: 1 addition & 1 deletion [refs]
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
---
refs/heads/master: 76e6faf7a3a3ad3e18a1b70f9e4cd96cdf58140d
refs/heads/master: fd761fd876e4d1c0d07b6d93bc45c999fa596cb0
98 changes: 85 additions & 13 deletions trunk/arch/ppc64/kernel/eeh.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ static int ibm_slot_error_detail;

static int eeh_subsystem_enabled;

/* Lock to avoid races due to multiple reports of an error.
 * Held while the EEH_MODE_ISOLATED flag of a slot is tested, set or
 * cleared, so that several devices in one slot reporting an error at
 * the same time start only one recovery. */
static DEFINE_SPINLOCK(confirm_error_lock);

/* Buffer for reporting slot-error-detail rtas calls.
 * NOTE(review): slot_errbuf_lock presumably serializes access to
 * slot_errbuf -- confirm against the users of the buffer. */
static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(slot_errbuf_lock);
Expand Down Expand Up @@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
return pa | (token & (PAGE_SIZE-1));
}

/**
* Return the "partitionable endpoint" (pe) under which this device lies
*/
static struct device_node * find_device_pe(struct device_node *dn)
{
while ((dn->parent) && PCI_DN(dn->parent) &&
(PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
dn = dn->parent;
}
return dn;
}

/**
 * __eeh_mark_slot - flag a node, its siblings and all their descendants
 * as isolated
 * @dn: first node of the sibling list to mark; may be NULL
 *
 * Sets EEH_MODE_ISOLATED in the pci_dn of every node reached by
 * following ->sibling from @dn and recursing into ->child.  A node
 * without a pci_dn is skipped, but its children are still visited.
 *
 * Marking all peers at once is critical, since some drivers poll
 * status registers in interrupts.  If a driver is polling, and the
 * slot is frozen, then the driver can deadlock in an interrupt
 * context, which is bad.
 *
 * NOTE(review): the earlier comment also promised to mark the device
 * driver; this routine only sets the flag on the device nodes.
 *
 * Takes no lock itself; eeh_dn_check_failure() calls it with
 * confirm_error_lock held.
 */

static inline void __eeh_mark_slot (struct device_node *dn)
{
	while (dn) {
		/* find_device_pe() allows for nodes whose PCI_DN() is NULL,
		 * so do the same here rather than dereference blindly. */
		if (PCI_DN(dn))
			PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED;

		if (dn->child)
			__eeh_mark_slot (dn->child);
		dn = dn->sibling;
	}
}

/**
 * __eeh_clear_slot - remove the isolated flag from a node, its siblings
 * and all their descendants
 * @dn: first node of the sibling list to clear; may be NULL
 *
 * Clears EEH_MODE_ISOLATED in the pci_dn of every node reached by
 * following ->sibling from @dn and recursing into ->child.  A node
 * without a pci_dn is skipped, but its children are still visited.
 *
 * Takes no lock itself; eeh_clear_slot() is the variant that holds
 * confirm_error_lock around the walk.
 */
static inline void __eeh_clear_slot (struct device_node *dn)
{
	while (dn) {
		/* find_device_pe() allows for nodes whose PCI_DN() is NULL,
		 * so do the same here rather than dereference blindly. */
		if (PCI_DN(dn))
			PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED;

		if (dn->child)
			__eeh_clear_slot (dn->child);
		dn = dn->sibling;
	}
}

/**
 * eeh_clear_slot - locked variant of __eeh_clear_slot()
 * @dn: first node of the sibling list to clear
 *
 * Runs __eeh_clear_slot() on @dn with confirm_error_lock held and
 * local interrupts disabled, restoring the saved interrupt state
 * afterwards.
 */
static inline void eeh_clear_slot (struct device_node *dn)
{
	unsigned long irq_state;

	spin_lock_irqsave(&confirm_error_lock, irq_state);
	__eeh_clear_slot (dn);
	spin_unlock_irqrestore(&confirm_error_lock, irq_state);
}

/**
* eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
* @dn device node
Expand All @@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
int reset_state;
struct eeh_event *event;
struct pci_dn *pdn;
struct device_node *pe_dn;
int rc = 0;

__get_cpu_var(total_mmio_ffs)++;

Expand Down Expand Up @@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
return 0;
}

/*
* If we already have a pending isolation event for this
* slot, we know it's bad already, we don't need to check...
/* If we already have a pending isolation event for this
* slot, we know it's bad already, we don't need to check.
* Do this checking under a lock; as multiple PCI devices
* in one slot might report errors simultaneously, and we
* only want one error recovery routine running.
*/
spin_lock_irqsave(&confirm_error_lock, flags);
rc = 1;
if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
atomic_inc(&eeh_fail_count);
if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
Expand All @@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
rets[0] = -1; /* reset state unknown */
eeh_panic(dev, rets[0]);
}
return 0;
goto dn_unlock;
}

/*
Expand All @@ -623,33 +681,42 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
ret, dn->full_name);
__get_cpu_var(false_positives)++;
return 0;
rc = 0;
goto dn_unlock;
}

/* If EEH is not supported on this device, punt. */
if (rets[1] != 1) {
printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
ret, dn->full_name);
__get_cpu_var(false_positives)++;
return 0;
rc = 0;
goto dn_unlock;
}

/* If not the kind of error we know about, punt. */
if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
__get_cpu_var(false_positives)++;
return 0;
rc = 0;
goto dn_unlock;
}

/* Note that config-io to empty slots may fail;
* we recognize empty because they don't have children. */
if ((rets[0] == 5) && (dn->child == NULL)) {
__get_cpu_var(false_positives)++;
return 0;
rc = 0;
goto dn_unlock;
}

/* prevent repeated reports of this failure */
pdn->eeh_mode |= EEH_MODE_ISOLATED;
__get_cpu_var(slot_resets)++;
__get_cpu_var(slot_resets)++;

/* Avoid repeated reports of this failure, including problems
* with other functions on this device, and functions under
* bridges. */
pe_dn = find_device_pe (dn);
__eeh_mark_slot (pe_dn);
spin_unlock_irqrestore(&confirm_error_lock, flags);

reset_state = rets[0];

Expand Down Expand Up @@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
if (rets[0] != 5) dump_stack();
schedule_work(&eeh_event_wq);

return 0;
return 1;

dn_unlock:
spin_unlock_irqrestore(&confirm_error_lock, flags);
return rc;
}

EXPORT_SYMBOL(eeh_dn_check_failure);
EXPORT_SYMBOL_GPL(eeh_dn_check_failure);

/**
* eeh_check_failure - check if all 1's data is due to EEH slot freeze
Expand Down Expand Up @@ -820,6 +891,7 @@ void __init eeh_init(void)
struct device_node *phb, *np;
struct eeh_early_enable_info info;

spin_lock_init(&confirm_error_lock);
spin_lock_init(&slot_errbuf_lock);

np = of_find_node_by_path("/rtas");
Expand Down

0 comments on commit b9d689b

Please sign in to comment.