Skip to content

Commit

Permalink
ieee1394: sbp2: handle "sbp2util_node_write_no_wait failed"
Browse files Browse the repository at this point in the history
Fix for http://bugzilla.kernel.org/show_bug.cgi?id=6948

Because sbp2 writes to the target's fetch agent's registers from within
atomic context, it cannot sleep and is therefore not guaranteed to get a
free transaction label.  This may repeatedly lead to
"sbp2util_node_write_no_wait failed" and consequently to SCSI command
abortion after timeout.  A likely cause is that many queue_command
softirqs may occur before khpsbpkt (the ieee1394 driver's thread which
cleans up after finished transactions) is woken up to recycle tlabels.

Sbp2 now schedules a workqueue job whenever sbp2_link_orb_command fails
in sbp2util_node_write_no_wait.  The job will reliably get a transaction
label because it can sleep.

We use the kernel-wide shared workqueue because it is unlikely that the
job itself actually needs to sleep.  In the improbable case that it has
to sleep, it doesn't need to sleep long since the standard transaction
timeout is 100ms.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
  • Loading branch information
Stefan Richter committed Sep 17, 2006
1 parent 2a87418 commit 09ee67a
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 5 deletions.
71 changes: 66 additions & 5 deletions drivers/ieee1394/sbp2.c
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,44 @@ static int sbp2util_node_write_no_wait(struct node_entry *ne, u64 addr,
return 0;
}

/*
 * Write @len bytes from @data to the register located @offset bytes past
 * the command block agent address of @scsi_id, then let the SCSI host
 * accept requests again.
 *
 * Nothing is written and nothing is unblocked while scsi_id's
 * unfinished_reset flag is set. A failed write is only logged.
 */
static void sbp2util_notify_fetch_agent(struct scsi_id_instance_data *scsi_id,
					u64 offset, quadlet_t *data, size_t len)
{
	u64 reg_addr;

	/*
	 * Shortly after a bus reset the node entry's generation is already
	 * current although the reconnect has not been completed yet. Do not
	 * touch the fetch agent during that window.
	 */
	if (atomic_read(&scsi_id->unfinished_reset) != 0)
		return;

	reg_addr = scsi_id->sbp2_command_block_agent_addr + offset;
	if (hpsb_node_write(scsi_id->ne, reg_addr, data, len)) {
		SBP2_ERR("sbp2util_notify_fetch_agent failed.");
	}

	/*
	 * A bus reset may have happened during hpsb_node_write. In that case
	 * keep the host blocked; otherwise accept new SCSI commands now.
	 */
	if (atomic_read(&scsi_id->unfinished_reset) != 0)
		return;

	scsi_unblock_requests(scsi_id->scsi_host);
}

/*
 * Work function: write the ORB pointer of the most recently linked ORB
 * (scsi_id->last_orb_dma) to the fetch agent's ORB_POINTER register.
 *
 * @p is the struct scsi_id_instance_data of the logical unit, passed as an
 * untyped pointer because this function is used as a work_struct callback.
 */
static void sbp2util_write_orb_pointer(void *p)
{
	struct scsi_id_instance_data *scsi_id = p;
	quadlet_t orb_pointer[2];

	/* First quadlet carries the host's node ID, second the ORB address. */
	orb_pointer[0] = ORB_SET_NODE_ID(scsi_id->hi->host->node_id);
	orb_pointer[1] = scsi_id->last_orb_dma;

	/* Convert both quadlets (8 bytes) to big endian before sending. */
	sbp2util_cpu_to_be32_buffer(orb_pointer, 8);

	sbp2util_notify_fetch_agent(scsi_id, SBP2_ORB_POINTER_OFFSET,
				    orb_pointer, 8);
}

/*
 * Work function: write to the fetch agent's DOORBELL register.
 *
 * @p is the struct scsi_id_instance_data of the logical unit, passed as an
 * untyped pointer because this function is used as a work_struct callback.
 * The write is 4 bytes long and is issued without a data buffer (NULL).
 */
static void sbp2util_write_doorbell(void *p)
{
	struct scsi_id_instance_data *scsi_id = p;

	sbp2util_notify_fetch_agent(scsi_id, SBP2_DOORBELL_OFFSET, NULL, 4);
}

/*
* This function is called to create a pool of command orbs used for
* command processing. It is called when a new sbp2 device is detected.
Expand Down Expand Up @@ -712,6 +750,7 @@ static int sbp2_remove(struct device *dev)
sbp2scsi_complete_all_commands(scsi_id, DID_NO_CONNECT);
/* scsi_remove_device() will trigger shutdown functions of SCSI
* highlevel drivers which would deadlock if blocked. */
atomic_set(&scsi_id->unfinished_reset, 0);
scsi_unblock_requests(scsi_id->scsi_host);
}
sdev = scsi_id->sdev;
Expand Down Expand Up @@ -765,6 +804,7 @@ static int sbp2_update(struct unit_directory *ud)

/* Make sure we unblock requests (since this is likely after a bus
* reset). */
atomic_set(&scsi_id->unfinished_reset, 0);
scsi_unblock_requests(scsi_id->scsi_host);

return 0;
Expand Down Expand Up @@ -795,6 +835,8 @@ static struct scsi_id_instance_data *sbp2_alloc_device(struct unit_directory *ud
INIT_LIST_HEAD(&scsi_id->sbp2_command_orb_completed);
INIT_LIST_HEAD(&scsi_id->scsi_list);
spin_lock_init(&scsi_id->sbp2_command_orb_lock);
atomic_set(&scsi_id->unfinished_reset, 0);
INIT_WORK(&scsi_id->protocol_work, NULL, NULL);

ud->device.driver_data = scsi_id;

Expand Down Expand Up @@ -879,8 +921,10 @@ static void sbp2_host_reset(struct hpsb_host *host)
hi = hpsb_get_hostinfo(&sbp2_highlevel, host);

if (hi) {
list_for_each_entry(scsi_id, &hi->scsi_ids, scsi_list)
list_for_each_entry(scsi_id, &hi->scsi_ids, scsi_list) {
atomic_set(&scsi_id->unfinished_reset, 1);
scsi_block_requests(scsi_id->scsi_host);
}
}
}

Expand Down Expand Up @@ -1032,7 +1076,7 @@ static void sbp2_remove_device(struct scsi_id_instance_data *scsi_id)
scsi_remove_host(scsi_id->scsi_host);
scsi_host_put(scsi_id->scsi_host);
}

flush_scheduled_work();
sbp2util_remove_command_orb_pool(scsi_id);

list_del(&scsi_id->scsi_list);
Expand Down Expand Up @@ -1661,6 +1705,10 @@ static int sbp2_agent_reset(struct scsi_id_instance_data *scsi_id, int wait)

SBP2_DEBUG_ENTER();

cancel_delayed_work(&scsi_id->protocol_work);
if (wait)
flush_scheduled_work();

data = ntohl(SBP2_AGENT_RESET_DATA);
addr = scsi_id->sbp2_command_block_agent_addr + SBP2_AGENT_RESET_OFFSET;

Expand Down Expand Up @@ -1982,9 +2030,22 @@ static void sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,

SBP2_ORB_DEBUG("write to %s register, command orb %p",
last_orb ? "DOORBELL" : "ORB_POINTER", command_orb);
if (sbp2util_node_write_no_wait(scsi_id->ne, addr, data, length))
SBP2_ERR("sbp2util_node_write_no_wait failed.\n");
/* We rely on SCSI EH to deal with _node_write_ failures. */
if (sbp2util_node_write_no_wait(scsi_id->ne, addr, data, length)) {
/*
* sbp2util_node_write_no_wait failed. We certainly ran out
* of transaction labels, perhaps just because there were no
* context switches which gave khpsbpkt a chance to collect
* free tlabels. Try again in non-atomic context. If necessary,
* the workqueue job will sleep until it is guaranteed a tlabel.
* We do not accept new commands until the job is over.
*/
scsi_block_requests(scsi_id->scsi_host);
PREPARE_WORK(&scsi_id->protocol_work,
last_orb ? sbp2util_write_doorbell:
sbp2util_write_orb_pointer,
scsi_id);
schedule_work(&scsi_id->protocol_work);
}
}

/*
Expand Down
3 changes: 3 additions & 0 deletions drivers/ieee1394/sbp2.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,9 @@ struct scsi_id_instance_data {

/* Device specific workarounds/brokeness */
unsigned workarounds;

atomic_t unfinished_reset;
struct work_struct protocol_work;
};

/* Sbp2 host data structure (one per IEEE1394 host) */
Expand Down

0 comments on commit 09ee67a

Please sign in to comment.