Skip to content

Commit

Permalink
[IB] mthca: first pass at catastrophic error reporting
Browse files Browse the repository at this point in the history
Add some initial support for detecting and reporting catastrophic
errors reported by Mellanox HCAs.  We start a periodic timer which
polls the catastrophic error reporting buffer in device memory.  If an
error is detected, we dump the contents of the buffer for port-mortem
debugging, and report a fatal asynchronous error to higher levels.

In the future we can try to recover from these errors by resetting the
device, but this will require some work in higher-level code as well.
Let's get this in now, so that we at least get catastrophic errors
reported in logs.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
  • Loading branch information
Roland Dreier committed Oct 27, 2005
1 parent 7cc656e commit 3d155f8
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 1 deletion.
3 changes: 2 additions & 1 deletion drivers/infiniband/hw/mthca/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o
mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \
mthca_catas.o
153 changes: 153 additions & 0 deletions drivers/infiniband/hw/mthca/mthca_catas.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Copyright (c) 2005 Cisco Systems. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* $Id$
*/

#include "mthca_dev.h"

enum {
MTHCA_CATAS_POLL_INTERVAL = 5 * HZ,

MTHCA_CATAS_TYPE_INTERNAL = 0,
MTHCA_CATAS_TYPE_UPLINK = 3,
MTHCA_CATAS_TYPE_DDR = 4,
MTHCA_CATAS_TYPE_PARITY = 5,
};

static DEFINE_SPINLOCK(catas_lock);

static void handle_catas(struct mthca_dev *dev)
{
struct ib_event event;
const char *type;
int i;

event.device = &dev->ib_dev;
event.event = IB_EVENT_DEVICE_FATAL;
event.element.port_num = 0;

ib_dispatch_event(&event);

switch (swab32(readl(dev->catas_err.map)) >> 24) {
case MTHCA_CATAS_TYPE_INTERNAL:
type = "internal error";
break;
case MTHCA_CATAS_TYPE_UPLINK:
type = "uplink bus error";
break;
case MTHCA_CATAS_TYPE_DDR:
type = "DDR data error";
break;
case MTHCA_CATAS_TYPE_PARITY:
type = "internal parity error";
break;
default:
type = "unknown error";
break;
}

mthca_err(dev, "Catastrophic error detected: %s\n", type);
for (i = 0; i < dev->catas_err.size; ++i)
mthca_err(dev, " buf[%02x]: %08x\n",
i, swab32(readl(dev->catas_err.map + i)));
}

static void poll_catas(unsigned long dev_ptr)
{
struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
unsigned long flags;
int i;

for (i = 0; i < dev->catas_err.size; ++i)
if (readl(dev->catas_err.map + i)) {
handle_catas(dev);
return;
}

spin_lock_irqsave(&catas_lock, flags);
if (dev->catas_err.stop)
mod_timer(&dev->catas_err.timer,
jiffies + MTHCA_CATAS_POLL_INTERVAL);
spin_unlock_irqrestore(&catas_lock, flags);

return;
}

void mthca_start_catas_poll(struct mthca_dev *dev)
{
unsigned long addr;

init_timer(&dev->catas_err.timer);
dev->catas_err.stop = 0;
dev->catas_err.map = NULL;

addr = pci_resource_start(dev->pdev, 0) +
((pci_resource_len(dev->pdev, 0) - 1) &
dev->catas_err.addr);

if (!request_mem_region(addr, dev->catas_err.size * 4,
DRV_NAME)) {
mthca_warn(dev, "couldn't request catastrophic error region "
"at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4);
return;
}

dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4);
if (!dev->catas_err.map) {
mthca_warn(dev, "couldn't map catastrophic error region "
"at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4);
release_mem_region(addr, dev->catas_err.size * 4);
return;
}

dev->catas_err.timer.data = (unsigned long) dev;
dev->catas_err.timer.function = poll_catas;
dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL;
add_timer(&dev->catas_err.timer);
}

void mthca_stop_catas_poll(struct mthca_dev *dev)
{
spin_lock_irq(&catas_lock);
dev->catas_err.stop = 1;
spin_unlock_irq(&catas_lock);

del_timer_sync(&dev->catas_err.timer);

if (dev->catas_err.map) {
iounmap(dev->catas_err.map);
release_mem_region(pci_resource_start(dev->pdev, 0) +
((pci_resource_len(dev->pdev, 0) - 1) &
dev->catas_err.addr),
dev->catas_err.size * 4);
}
}
5 changes: 5 additions & 0 deletions drivers/infiniband/hw/mthca/mthca_cmd.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
* Copyright (c) 2005 Mellanox Technologies. All rights reserved.
* Copyright (c) 2005 Cisco Systems. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -706,9 +707,13 @@ int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status)

MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
dev->cmd.max_cmds = 1 << lg;
MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET);
MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET);

mthca_dbg(dev, "FW version %012llx, max commands %d\n",
(unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n",
(unsigned long long) dev->catas_err.addr, dev->catas_err.size);

if (mthca_is_memfree(dev)) {
MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET);
Expand Down
13 changes: 13 additions & 0 deletions drivers/infiniband/hw/mthca/mthca_dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,14 @@ struct mthca_mcg_table {
struct mthca_icm_table *table;
};

struct mthca_catas_err {
u64 addr;
u32 __iomem *map;
unsigned long stop;
u32 size;
struct timer_list timer;
};

struct mthca_dev {
struct ib_device ib_dev;
struct pci_dev *pdev;
Expand Down Expand Up @@ -318,6 +326,8 @@ struct mthca_dev {
struct mthca_av_table av_table;
struct mthca_mcg_table mcg_table;

struct mthca_catas_err catas_err;

struct mthca_uar driver_uar;
struct mthca_db_table *db_tab;
struct mthca_pd driver_pd;
Expand Down Expand Up @@ -405,6 +415,9 @@ void mthca_cleanup_mcg_table(struct mthca_dev *dev);
int mthca_register_device(struct mthca_dev *dev);
void mthca_unregister_device(struct mthca_dev *dev);

void mthca_start_catas_poll(struct mthca_dev *dev);
void mthca_stop_catas_poll(struct mthca_dev *dev);

int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);

Expand Down
3 changes: 3 additions & 0 deletions drivers/infiniband/hw/mthca/mthca_provider.c
Original file line number Diff line number Diff line change
Expand Up @@ -1175,10 +1175,13 @@ int mthca_register_device(struct mthca_dev *dev)
}
}

mthca_start_catas_poll(dev);

return 0;
}

void mthca_unregister_device(struct mthca_dev *dev)
{
mthca_stop_catas_poll(dev);
ib_unregister_device(&dev->ib_dev);
}

0 comments on commit 3d155f8

Please sign in to comment.