Skip to content

Commit

Permalink
[SCSI] hpsa: detect controller lockup
Browse files Browse the repository at this point in the history
When a controller lockup condition is detected,
we should fail all outstanding commands and disable
the controller.  This allows multipath solutions
to recover gracefully.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
  • Loading branch information
Stephen M. Cameron authored and James Bottomley committed Oct 30, 2011
1 parent bb158ea commit a0c1241
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 4 deletions.
184 changes: 180 additions & 4 deletions drivers/scsi/hpsa.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include <linux/bitmap.h>
#include <linux/atomic.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
#include "hpsa_cmd.h"
#include "hpsa.h"

Expand Down Expand Up @@ -127,6 +128,10 @@ static struct board_type products[] = {

static int number_of_controllers;

/*
 * State shared by the controller lockup detector.
 *
 * hpsa_ctlr_list:       controllers being monitored, linked through
 *                       ctlr_info.lockup_list.
 * lockup_detector_lock: taken around every walk of, and insertion into,
 *                       hpsa_ctlr_list; initialised at run time when the
 *                       detector thread is first started.
 * hpsa_lockup_detector: the kthread that polls the controllers, or NULL
 *                       while no such thread exists.
 */
static struct list_head hpsa_ctlr_list = LIST_HEAD_INIT(hpsa_ctlr_list);
static spinlock_t lockup_detector_lock;
static struct task_struct *hpsa_lockup_detector;

static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id);
static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id);
static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg);
Expand Down Expand Up @@ -1337,6 +1342,22 @@ static inline void hpsa_scsi_do_simple_cmd_core(struct ctlr_info *h,
wait_for_completion(&wait);
}

/*
 * Submit command c to controller h and wait for it, unless the controller
 * has been flagged as locked up.  In that case the command is never sent;
 * its error info is filled in with CMD_HARDWARE_ERR so the caller sees a
 * (faked) hardware error instead.
 */
static void hpsa_scsi_do_simple_cmd_core_if_no_lockup(struct ctlr_info *h,
	struct CommandList *c)
{
	unsigned long irq_flags;
	int locked_up;

	/* Sample the lockup flag under h->lock, then act on the snapshot. */
	spin_lock_irqsave(&h->lock, irq_flags);
	locked_up = (h->lockup_detected != 0);
	spin_unlock_irqrestore(&h->lock, irq_flags);

	if (unlikely(locked_up)) {
		/* If controller lockup detected, fake a hardware error. */
		c->err_info->CommandStatus = CMD_HARDWARE_ERR;
		return;
	}

	hpsa_scsi_do_simple_cmd_core(h, c);
}

static void hpsa_scsi_do_simple_cmd_with_retry(struct ctlr_info *h,
struct CommandList *c, int data_direction)
{
Expand Down Expand Up @@ -2052,8 +2073,14 @@ static int hpsa_scsi_queue_command_lck(struct scsi_cmnd *cmd,
}
memcpy(scsi3addr, dev->scsi3addr, sizeof(scsi3addr));

/* Need a lock as this is being allocated from the pool */
spin_lock_irqsave(&h->lock, flags);
if (unlikely(h->lockup_detected)) {
spin_unlock_irqrestore(&h->lock, flags);
cmd->result = DID_ERROR << 16;
done(cmd);
return 0;
}
/* Need a lock as this is being allocated from the pool */
c = cmd_alloc(h);
spin_unlock_irqrestore(&h->lock, flags);
if (c == NULL) { /* trouble... */
Expand Down Expand Up @@ -2605,7 +2632,7 @@ static int hpsa_passthru_ioctl(struct ctlr_info *h, void __user *argp)
c->SG[0].Len = iocommand.buf_size;
c->SG[0].Ext = 0; /* we are not chaining*/
}
hpsa_scsi_do_simple_cmd_core(h, c);
hpsa_scsi_do_simple_cmd_core_if_no_lockup(h, c);
if (iocommand.buf_size > 0)
hpsa_pci_unmap(h->pdev, c, 1, PCI_DMA_BIDIRECTIONAL);
check_ioctl_unit_attention(h, c);
Expand Down Expand Up @@ -2728,7 +2755,7 @@ static int hpsa_big_passthru_ioctl(struct ctlr_info *h, void __user *argp)
c->SG[i].Ext = 0;
}
}
hpsa_scsi_do_simple_cmd_core(h, c);
hpsa_scsi_do_simple_cmd_core_if_no_lockup(h, c);
if (sg_used)
hpsa_pci_unmap(h->pdev, c, sg_used, PCI_DMA_BIDIRECTIONAL);
check_ioctl_unit_attention(h, c);
Expand Down Expand Up @@ -3097,6 +3124,7 @@ static irqreturn_t hpsa_intx_discard_completions(int irq, void *dev_id)
if (interrupt_not_for_us(h))
return IRQ_NONE;
spin_lock_irqsave(&h->lock, flags);
h->last_intr_timestamp = get_jiffies_64();
while (interrupt_pending(h)) {
raw_tag = get_next_completion(h);
while (raw_tag != FIFO_EMPTY)
Expand All @@ -3116,6 +3144,7 @@ static irqreturn_t hpsa_msix_discard_completions(int irq, void *dev_id)
return IRQ_NONE;

spin_lock_irqsave(&h->lock, flags);
h->last_intr_timestamp = get_jiffies_64();
raw_tag = get_next_completion(h);
while (raw_tag != FIFO_EMPTY)
raw_tag = next_command(h);
Expand All @@ -3132,6 +3161,7 @@ static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id)
if (interrupt_not_for_us(h))
return IRQ_NONE;
spin_lock_irqsave(&h->lock, flags);
h->last_intr_timestamp = get_jiffies_64();
while (interrupt_pending(h)) {
raw_tag = get_next_completion(h);
while (raw_tag != FIFO_EMPTY) {
Expand All @@ -3152,6 +3182,7 @@ static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id)
u32 raw_tag;

spin_lock_irqsave(&h->lock, flags);
h->last_intr_timestamp = get_jiffies_64();
raw_tag = get_next_completion(h);
while (raw_tag != FIFO_EMPTY) {
if (hpsa_tag_contains_index(raw_tag))
Expand Down Expand Up @@ -4089,6 +4120,149 @@ static void hpsa_undo_allocations_after_kdump_soft_reset(struct ctlr_info *h)
kfree(h);
}

/*
 * Unlink controller h from the list of monitored controllers.
 * Caller must hold lockup_detector_lock.
 *
 * Nothing is done when no detector thread exists, or when h has already
 * been flagged as locked up (in which case the lockup detector has
 * already been stopped for it).
 */
static void remove_ctlr_from_lockup_detector_list(struct ctlr_info *h)
{
	assert_spin_locked(&lockup_detector_lock);

	if (hpsa_lockup_detector && !h->lockup_detected)
		list_del(&h->lockup_list);
}

/*
 * Called when controller lockup detected.
 *
 * Marks every command queued on @list as failed with CMD_HARDWARE_ERR and
 * completes it.  Caller must hold h->lock.
 */
static void fail_all_cmds_on_list(struct ctlr_info *h, struct list_head *list)
{
	assert_spin_locked(&h->lock);

	/*
	 * Keep taking the head entry until the list drains.
	 * NOTE(review): this terminates only if finish_cmd() unlinks the
	 * command from the list -- presumably it does; confirm.
	 */
	while (!list_empty(list)) {
		struct CommandList *cmd;

		cmd = list_entry(list->next, struct CommandList, list);
		cmd->err_info->CommandStatus = CMD_HARDWARE_ERR;
		finish_cmd(cmd, cmd->Header.Tag.lower);
	}
}

static void controller_lockup_detected(struct ctlr_info *h)
{
unsigned long flags;

assert_spin_locked(&lockup_detector_lock);
remove_ctlr_from_lockup_detector_list(h);
h->access.set_intr_mask(h, HPSA_INTR_OFF);
spin_lock_irqsave(&h->lock, flags);
h->lockup_detected = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET);
spin_unlock_irqrestore(&h->lock, flags);
dev_warn(&h->pdev->dev, "Controller lockup detected: 0x%08x\n",
h->lockup_detected);
pci_disable_device(h->pdev);
spin_lock_irqsave(&h->lock, flags);
fail_all_cmds_on_list(h, &h->cmpQ);
fail_all_cmds_on_list(h, &h->reqQ);
spin_unlock_irqrestore(&h->lock, flags);
}

/*
 * HEARTBEAT_SAMPLE_INTERVAL: how long the lockup detector thread sleeps
 * between passes over the controller list, in jiffies (10 * HZ).
 *
 * HEARTBEAT_CHECK_MINIMUM_INTERVAL: half of the sample interval.  A
 * controller's heartbeat is only examined when at least this long has
 * passed since both its last interrupt and its last heartbeat check.
 */
#define HEARTBEAT_SAMPLE_INTERVAL (10 * HZ)
#define HEARTBEAT_CHECK_MINIMUM_INTERVAL (HEARTBEAT_SAMPLE_INTERVAL / 2)

/*
 * Decide whether controller h has locked up, and react if so.
 * Caller must hold lockup_detector_lock.
 *
 * The controller is considered healthy if it raised an interrupt recently
 * or if its heartbeat was already checked recently.  Otherwise the
 * heartbeat counter in the config table is read; a value unchanged since
 * the previous check means the controller is locked up.
 */
static void detect_controller_lockup(struct ctlr_info *h)
{
	unsigned long irq_flags;
	u32 current_beat;
	u64 now;

	assert_spin_locked(&lockup_detector_lock);
	now = get_jiffies_64();

	/*
	 * Nothing to do if either:
	 *  - we've received an interrupt recently, or
	 *  - we've already checked the heartbeat recently.  This could
	 *    happen if someone sends us a signal; we otherwise don't care
	 *    about signals in this thread.
	 */
	if (time_after64(h->last_intr_timestamp +
				HEARTBEAT_CHECK_MINIMUM_INTERVAL, now) ||
	    time_after64(h->last_heartbeat_timestamp +
				HEARTBEAT_CHECK_MINIMUM_INTERVAL, now))
		return;

	spin_lock_irqsave(&h->lock, irq_flags);
	current_beat = readl(&h->cfgtable->HeartBeat);
	spin_unlock_irqrestore(&h->lock, irq_flags);

	if (current_beat != h->last_heartbeat) {
		/* Heartbeat moved: we're ok.  Remember it for next time. */
		h->last_heartbeat = current_beat;
		h->last_heartbeat_timestamp = now;
		return;
	}

	/* Heartbeat has not changed since we last looked: we're not ok. */
	controller_lockup_detected(h);
}

/*
 * Body of the lockup detector kthread.
 *
 * Sleeps for HEARTBEAT_SAMPLE_INTERVAL, then checks every controller on
 * hpsa_ctlr_list for a lockup while holding lockup_detector_lock.  Repeats
 * until kthread_should_stop() reports true.  The argument is unused.
 * Always returns 0.
 */
static int detect_controller_lockup_thread(void *notused)
{
	for (;;) {
		struct list_head *pos, *next;
		struct ctlr_info *ctlr;
		unsigned long irq_flags;

		schedule_timeout_interruptible(HEARTBEAT_SAMPLE_INTERVAL);
		if (kthread_should_stop())
			return 0;

		spin_lock_irqsave(&lockup_detector_lock, irq_flags);
		/*
		 * The "safe" iterator is required: a controller found to be
		 * locked up is unlinked from the list during the walk.
		 */
		list_for_each_safe(pos, next, &hpsa_ctlr_list) {
			ctlr = list_entry(pos, struct ctlr_info, lockup_list);
			detect_controller_lockup(ctlr);
		}
		spin_unlock_irqrestore(&lockup_detector_lock, irq_flags);
	}
}

/*
 * Append controller h to the tail of the list of controllers watched by
 * the lockup detector.  Takes lockup_detector_lock itself.
 */
static void add_ctlr_to_lockup_detector_list(struct ctlr_info *h)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&lockup_detector_lock, irq_flags);
	list_add_tail(&h->lockup_list, &hpsa_ctlr_list);
	spin_unlock_irqrestore(&lockup_detector_lock, irq_flags);
}

/*
 * Begin lockup monitoring for controller h.
 *
 * The first caller initialises lockup_detector_lock and starts the shared
 * detector thread; later callers reuse it.  If the thread cannot be
 * started a warning is logged, hpsa_lockup_detector is left NULL and h is
 * not added to the monitored list.
 */
static void start_controller_lockup_detector(struct ctlr_info *h)
{
	/* Start the lockup detector thread if not already started */
	if (!hpsa_lockup_detector) {
		struct task_struct *detector;

		spin_lock_init(&lockup_detector_lock);
		detector = kthread_run(detect_controller_lockup_thread,
					NULL, "hpsa");
		/*
		 * kthread_run() signals failure with an ERR_PTR() value,
		 * never NULL.  Storing that in hpsa_lockup_detector would
		 * make the rest of the driver believe a thread exists and
		 * later hand the error pointer to kthread_stop().
		 */
		if (IS_ERR(detector)) {
			dev_warn(&h->pdev->dev,
				"Could not start lockup detector thread\n");
			return;
		}
		hpsa_lockup_detector = detector;
	}
	add_ctlr_to_lockup_detector_list(h);
}

/*
 * Stop lockup monitoring for controller h.
 *
 * Unlinks h from the monitored list and, if that leaves the list empty
 * and a detector thread exists, stops the thread and clears
 * hpsa_lockup_detector.
 */
static void stop_controller_lockup_detector(struct ctlr_info *h)
{
	unsigned long flags;

	spin_lock_irqsave(&lockup_detector_lock, flags);
	remove_ctlr_from_lockup_detector_list(h);
	/*
	 * If the list of ctlr's to monitor is empty, stop the thread.
	 * hpsa_lockup_detector may already be NULL here (for example when
	 * removing a controller that locked up after the thread was stopped
	 * on behalf of another controller), and kthread_stop() must not be
	 * handed a NULL task.
	 */
	if (hpsa_lockup_detector && list_empty(&hpsa_ctlr_list)) {
		/*
		 * kthread_stop() sleeps until the thread has exited, so it
		 * cannot be called with a spinlock held and interrupts off.
		 * The thread also takes lockup_detector_lock on every pass,
		 * so holding the lock while waiting for it could deadlock.
		 */
		spin_unlock_irqrestore(&lockup_detector_lock, flags);
		kthread_stop(hpsa_lockup_detector);
		spin_lock_irqsave(&lockup_detector_lock, flags);
		hpsa_lockup_detector = NULL;
	}
	spin_unlock_irqrestore(&lockup_detector_lock, flags);
}

static int __devinit hpsa_init_one(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
Expand Down Expand Up @@ -4234,6 +4408,7 @@ static int __devinit hpsa_init_one(struct pci_dev *pdev,

hpsa_hba_inquiry(h);
hpsa_register_scsi(h); /* hook ourselves into SCSI subsystem */
start_controller_lockup_detector(h);
return 1;

clean4:
Expand Down Expand Up @@ -4296,10 +4471,11 @@ static void __devexit hpsa_remove_one(struct pci_dev *pdev)
struct ctlr_info *h;

if (pci_get_drvdata(pdev) == NULL) {
dev_err(&pdev->dev, "unable to remove device \n");
dev_err(&pdev->dev, "unable to remove device\n");
return;
}
h = pci_get_drvdata(pdev);
stop_controller_lockup_detector(h);
hpsa_unregister_scsi(h); /* unhook from SCSI subsystem */
hpsa_shutdown(pdev);
iounmap(h->vaddr);
Expand Down
5 changes: 5 additions & 0 deletions drivers/scsi/hpsa.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ struct ctlr_info {
unsigned char reply_pool_wraparound;
u32 *blockFetchTable;
unsigned char *hba_inquiry_data;
u64 last_intr_timestamp;
u32 last_heartbeat;
u64 last_heartbeat_timestamp;
u32 lockup_detected;
struct list_head lockup_list;
};
#define HPSA_ABORT_MSG 0
#define HPSA_DEVICE_RESET_MSG 1
Expand Down

0 comments on commit a0c1241

Please sign in to comment.