Skip to content

Commit

Permalink
IB/ipath: Fix and recover TXE piobuf and PBC parity errors
Browse files Browse the repository at this point in the history
We can sometimes trigger parity errors due to processor speculative
reads to our write-combined memory (mostly seen on Woodcrest).   Add a
stats counter for these.

Factored out the sendbuffererror buffer cancellation code so it can be
used in the new handling; suppress likely subsequent error messages if
within two jiffies of the cancellation.

Also restore 2 dropped TXE lines on hwe_bitsextant noticed while
debugging.

Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
  • Loading branch information
Bryan O'Sullivan authored and Roland Dreier committed Sep 28, 2006
1 parent 5108477 commit 89d1e09
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 52 deletions.
3 changes: 2 additions & 1 deletion drivers/infiniband/hw/ipath/ipath_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,9 @@ struct infinipath_stats {
* packets if ipath not configured, etc.)
*/
__u64 sps_krdrops;
__u64 sps_txeparity; /* PIO buffer parity error, recovered */
/* pad for future growth */
__u64 __sps_pad[46];
__u64 __sps_pad[45];
};

/*
Expand Down
32 changes: 31 additions & 1 deletion drivers/infiniband/hw/ipath/ipath_iba6110.c
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,10 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
* make sure we get this much out, unless told to be quiet,
* or it's occurred within the last 5 seconds
*/
if ((hwerrs & ~dd->ipath_lasthwerror) ||
if ((hwerrs & ~(dd->ipath_lasthwerror |
((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
(ipath_debug & __IPATH_VERBDBG))
dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
"(cleared)\n", (unsigned long long) hwerrs);
Expand All @@ -464,6 +467,33 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,

ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
if (ctrl & INFINIPATH_C_FREEZEMODE) {
/*
* parity errors in send memory are recoverable,
* just cancel the send (if indicated in * sendbuffererror),
* count the occurrence, unfreeze (if no other handled
* hardware error bits are set), and continue. They can
* occur if a processor speculative read is done to the PIO
* buffer while we are sending a packet, for example.
*/
if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
ipath_stats.sps_txeparity++;
ipath_dbg("Recovering from TXE parity error (%llu), "
"hwerrstatus=%llx\n",
(unsigned long long) ipath_stats.sps_txeparity,
(unsigned long long) hwerrs);
ipath_disarm_senderrbufs(dd);
hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
if (!hwerrs) { /* else leave in freeze mode */
ipath_write_kreg(dd,
dd->ipath_kregs->kr_control,
dd->ipath_control);
return;
}
}
if (hwerrs) {
/*
* if any set that we aren't ignoring; only
Expand Down
37 changes: 34 additions & 3 deletions drivers/infiniband/hw/ipath/ipath_iba6120.c
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
* make sure we get this much out, unless told to be quiet,
* or it's occurred within the last 5 seconds
*/
if ((hwerrs & ~dd->ipath_lasthwerror) ||
if ((hwerrs & ~(dd->ipath_lasthwerror |
((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
(ipath_debug & __IPATH_VERBDBG))
dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
"(cleared)\n", (unsigned long long) hwerrs);
Expand All @@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,

ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
if (ctrl & INFINIPATH_C_FREEZEMODE) {
/*
* parity errors in send memory are recoverable,
* just cancel the send (if indicated in * sendbuffererror),
* count the occurrence, unfreeze (if no other handled
* hardware error bits are set), and continue. They can
* occur if a processor speculative read is done to the PIO
* buffer while we are sending a packet, for example.
*/
if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
ipath_stats.sps_txeparity++;
ipath_dbg("Recovering from TXE parity error (%llu), "
"hwerrstatus=%llx\n",
(unsigned long long) ipath_stats.sps_txeparity,
(unsigned long long) hwerrs);
ipath_disarm_senderrbufs(dd);
hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
if (!hwerrs) { /* else leave in freeze mode */
ipath_write_kreg(dd,
dd->ipath_kregs->kr_control,
dd->ipath_control);
return;
}
}
if (hwerrs) {
/*
* if any set that we aren't ignoring only make the
Expand All @@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
} else {
ipath_dbg("Clearing freezemode on ignored hardware "
"error\n");
ctrl &= ~INFINIPATH_C_FREEZEMODE;
ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
ctrl);
dd->ipath_control);
}
}

Expand Down Expand Up @@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
dd->ipath_hwe_bitsextant =
(INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
(INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
(INFINIPATH_HWE_PCIEMEMPARITYERR_MASK <<
INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) |
INFINIPATH_HWE_PCIE1PLLFAILED |
Expand Down
98 changes: 53 additions & 45 deletions drivers/infiniband/hw/ipath/ipath_intr.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,50 @@
#include "ipath_verbs.h"
#include "ipath_common.h"

/*
* Called when we might have an error that is specific to a particular
* PIO buffer, and may need to cancel that buffer, so it can be re-used.
*/
void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
{
u32 piobcnt;
unsigned long sbuf[4];
/*
* it's possible that sendbuffererror could have bits set; might
* have already done this as a result of hardware error handling
*/
piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
/* read these before writing errorclear */
sbuf[0] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror);
sbuf[1] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 1);
if (piobcnt > 128) {
sbuf[2] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 2);
sbuf[3] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 3);
}

if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
int i;
if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG)) {
__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
"SendbufErrs %lx %lx", sbuf[0],
sbuf[1]);
if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
printk(" %lx %lx ", sbuf[2], sbuf[3]);
printk("\n");
}

for (i = 0; i < piobcnt; i++)
if (test_bit(i, sbuf))
ipath_disarm_piobufs(dd, i, 1);
dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */
}
}


/* These are all rcv-related errors which we want to count for stats */
#define E_SUM_PKTERRS \
(INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
Expand Down Expand Up @@ -68,53 +112,9 @@

static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
{
unsigned long sbuf[4];
u64 ignore_this_time = 0;
u32 piobcnt;

/* if possible that sendbuffererror could be valid */
piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
/* read these before writing errorclear */
sbuf[0] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror);
sbuf[1] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 1);
if (piobcnt > 128) {
sbuf[2] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 2);
sbuf[3] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 3);
}

if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
int i;

ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]);
if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
printk("%lx %lx ", sbuf[2], sbuf[3]);
for (i = 0; i < piobcnt; i++) {
if (test_bit(i, sbuf)) {
u32 __iomem *piobuf;
if (i < dd->ipath_piobcnt2k)
piobuf = (u32 __iomem *)
(dd->ipath_pio2kbase +
i * dd->ipath_palign);
else
piobuf = (u32 __iomem *)
(dd->ipath_pio4kbase +
(i - dd->ipath_piobcnt2k) *
dd->ipath_4kalign);

ipath_cdbg(PKT,
"PIObuf[%u] @%p pbc is %x; ",
i, piobuf, readl(piobuf));

ipath_disarm_piobufs(dd, i, 1);
}
}
if (ipath_debug & __IPATH_PKTDBG)
printk("\n");
}
ipath_disarm_senderrbufs(dd);
if ((errs & E_SUM_LINK_PKTERRS) &&
!(dd->ipath_flags & IPATH_LINKACTIVE)) {
/*
Expand Down Expand Up @@ -554,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
~(INFINIPATH_E_HARDWARE |
INFINIPATH_E_IBSTATUSCHANGED);
}

/* likely due to cancel, so suppress */
if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
dd->ipath_lastcancel > jiffies) {
ipath_dbg("Suppressed armlaunch/spktlen after error send cancel\n");
errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
}

if (!errs)
return 0;

Expand Down
6 changes: 4 additions & 2 deletions drivers/infiniband/hw/ipath/ipath_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,9 @@ struct ipath_devdata {
unsigned long ipath_rcvctrl;
/* shadow kr_sendctrl */
unsigned long ipath_sendctrl;
/* ports waiting for PIOavail intr */
unsigned long ipath_portpiowait;
unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */

/* value we put in kr_rcvhdrcnt */
u32 ipath_rcvhdrcnt;
Expand Down Expand Up @@ -490,8 +493,6 @@ struct ipath_devdata {
u32 ipath_htwidth;
/* HT speed (200,400,800,1000) from HT config */
u32 ipath_htspeed;
/* ports waiting for PIOavail intr */
unsigned long ipath_portpiowait;
/*
* number of sequential ibcstatus change for polling active/quiet
* (i.e., link not coming up).
Expand Down Expand Up @@ -585,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd);
void ipath_disable_wc(struct ipath_devdata *dd);
int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
void ipath_shutdown_device(struct ipath_devdata *);
void ipath_disarm_senderrbufs(struct ipath_devdata *);

struct file_operations;
int ipath_cdev_init(int minor, char *name, struct file_operations *fops,
Expand Down

0 comments on commit 89d1e09

Please sign in to comment.