s390/dasd: channel path aware error recovery
With this feature, the DASD device driver more robustly handles DASDs
that are attached via multiple channel paths and are subject to
constant Interface-Control-Checks (IFCCs) and Channel-Control-Checks
(CCCs) or loss of High-Performance-FICON (HPF) functionality on one or
more of these paths.

If a channel path does not work correctly, it is removed from normal
operation as long as other channel paths are available. All extended
error recovery states can be queried and reset via user space
interfaces.
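
As an illustration of the user space interface: the attribute names below (path_threshold, path_interval, path_reset under /sys/bus/ccw/devices/<bus-id>/) and the bus ID are assumptions based on the companion patches of this series, which are not shown in this diff. A minimal user-space sketch that returns a disabled channel path to normal operation:

/* Hypothetical sketch: attribute name and bus ID are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *attr = "/sys/bus/ccw/devices/0.0.4711/path_reset";
        int fd = open(attr, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* a two-digit hex path mask; 80 selects the path in position 0 */
        if (write(fd, "80", 2) != 2)
                perror("write");
        close(fd);
        return 0;
}

Reading path_threshold and path_interval the same way would show the current IFCC accounting settings.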

Signed-off-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Reviewed-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Reviewed-by: Jan Hoeppner <hoeppner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Stefan Haberland authored and Martin Schwidefsky committed Dec 12, 2016
1 parent c934615 commit a521b04
Showing 7 changed files with 529 additions and 109 deletions.
arch/s390/include/asm/scsw.h: 5 additions & 1 deletion
@@ -96,7 +96,8 @@ struct tm_scsw {
        u32 dstat:8;
        u32 cstat:8;
        u32 fcxs:8;
-       u32 schxs:8;
+       u32 ifob:1;
+       u32 sesq:7;
 } __attribute__ ((packed));
 
 /**
@@ -177,6 +178,9 @@ union scsw {
 #define SCHN_STAT_INTF_CTRL_CHK         0x02
 #define SCHN_STAT_CHAIN_CHECK           0x01
 
+#define SCSW_SESQ_DEV_NOFCX    3
+#define SCSW_SESQ_PATH_NOFCX   4
+
 /*
  * architectured values for first sense byte
  */
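
The hunk above splits the former one-byte schxs field into a 1-bit ifob flag and a 7-bit sesq qualifier, so struct tm_scsw keeps the architected SCSW size of three 32-bit words. A hedged build-time sketch of that invariant (not part of the patch; BUILD_BUG_ON from <linux/bug.h> is assumed as the usual kernel tool):

/* Sketch: guard the assumption that ifob:1 + sesq:7 exactly refill the
 * byte that schxs occupied, leaving the SCSW at 12 bytes. */
#include <linux/bug.h>
#include <asm/scsw.h>

static inline void tm_scsw_layout_check(void)
{
        BUILD_BUG_ON(sizeof(struct tm_scsw) != 12);
        BUILD_BUG_ON(sizeof(union scsw) != 12);
}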
drivers/s390/block/dasd.c: 111 additions & 38 deletions
@@ -69,6 +69,7 @@ static void dasd_block_tasklet(struct dasd_block *);
 static void do_kick_device(struct work_struct *);
 static void do_restore_device(struct work_struct *);
 static void do_reload_device(struct work_struct *);
+static void do_requeue_requests(struct work_struct *);
 static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
 static void dasd_device_timeout(unsigned long);
 static void dasd_block_timeout(unsigned long);
@@ -125,6 +126,7 @@ struct dasd_device *dasd_alloc_device(void)
        INIT_WORK(&device->kick_work, do_kick_device);
        INIT_WORK(&device->restore_device, do_restore_device);
        INIT_WORK(&device->reload_device, do_reload_device);
+       INIT_WORK(&device->requeue_requests, do_requeue_requests);
        device->state = DASD_STATE_NEW;
        device->target = DASD_STATE_NEW;
        mutex_init(&device->state_mutex);
@@ -1622,6 +1624,13 @@ void dasd_generic_handle_state_change(struct dasd_device *device)
 }
 EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change);
 
+static int dasd_check_hpf_error(struct irb *irb)
+{
+       return (scsw_tm_is_valid_schxs(&irb->scsw) &&
+               (irb->scsw.tm.sesq == SCSW_SESQ_DEV_NOFCX ||
+                irb->scsw.tm.sesq == SCSW_SESQ_PATH_NOFCX));
+}
+
 /*
  * Interrupt handler for "normal" ssch-io based dasd devices.
  */
@@ -1748,6 +1757,13 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm,
                                          struct dasd_ccw_req, devlist);
                }
        } else {  /* error */
+               /* check for HPF error
+                * call discipline function to requeue all requests
+                * and disable HPF accordingly
+                */
+               if (cqr->cpmode && dasd_check_hpf_error(irb) &&
+                   device->discipline->handle_hpf_error)
+                       device->discipline->handle_hpf_error(device, irb);
                /*
                 * If we don't want complex ERP for this request, then just
                 * reset this and retry it in the fastpath
@@ -2924,10 +2940,10 @@ static int _dasd_requeue_request(struct dasd_ccw_req *cqr)
 
        if (!block)
                return -EINVAL;
-       spin_lock_irqsave(&block->queue_lock, flags);
+       spin_lock_irqsave(&block->request_queue_lock, flags);
        req = (struct request *) cqr->callback_data;
        blk_requeue_request(block->request_queue, req);
-       spin_unlock_irqrestore(&block->queue_lock, flags);
+       spin_unlock_irqrestore(&block->request_queue_lock, flags);
 
        return 0;
 }
@@ -3701,7 +3717,7 @@ EXPORT_SYMBOL_GPL(dasd_generic_notify);
 void dasd_generic_path_event(struct ccw_device *cdev, int *path_event)
 {
        struct dasd_device *device;
-       int chp, oldopm;
+       int chp, oldopm, hpfpm, ifccpm;
 
        device = dasd_device_from_cdev_locked(cdev);
        if (IS_ERR(device))
@@ -3733,7 +3749,30 @@ void dasd_generic_path_event(struct ccw_device *cdev, int *path_event)
                        device->discipline->kick_validate(device);
                }
        }
-       if (oldopm && !dasd_path_get_opm(device)) {
+       hpfpm = dasd_path_get_hpfpm(device);
+       ifccpm = dasd_path_get_ifccpm(device);
+       if (!dasd_path_get_opm(device) && hpfpm) {
+               /*
+                * device has no operational paths but at least one path is
+                * disabled due to HPF errors
+                * disable HPF at all and use the path(s) again
+                */
+               if (device->discipline->disable_hpf)
+                       device->discipline->disable_hpf(device);
+               dasd_device_set_stop_bits(device, DASD_STOPPED_NOT_ACC);
+               dasd_path_set_tbvpm(device, hpfpm);
+               dasd_schedule_device_bh(device);
+               dasd_schedule_requeue(device);
+       } else if (!dasd_path_get_opm(device) && ifccpm) {
+               /*
+                * device has no operational paths but at least one path is
+                * disabled due to IFCC errors
+                * trigger path verification on paths with IFCC errors
+                */
+               dasd_path_set_tbvpm(device, ifccpm);
+               dasd_schedule_device_bh(device);
+       }
+       if (oldopm && !dasd_path_get_opm(device) && !hpfpm && !ifccpm) {
                dev_warn(&device->cdev->dev,
                         "No verified channel paths remain for the device\n");
                DBF_DEV_EVENT(DBF_WARNING, device,
@@ -3757,30 +3796,18 @@ int dasd_generic_verify_path(struct dasd_device *device, __u8 lpm)
 }
 EXPORT_SYMBOL_GPL(dasd_generic_verify_path);
 
-
-int dasd_generic_pm_freeze(struct ccw_device *cdev)
+/*
+ * clear active requests and requeue them to block layer if possible
+ */
+static int dasd_generic_requeue_all_requests(struct dasd_device *device)
 {
-       struct dasd_device *device = dasd_device_from_cdev(cdev);
-       struct list_head freeze_queue;
+       struct list_head requeue_queue;
        struct dasd_ccw_req *cqr, *n;
        struct dasd_ccw_req *refers;
        int rc;
 
-       if (IS_ERR(device))
-               return PTR_ERR(device);
-
-       /* mark device as suspended */
-       set_bit(DASD_FLAG_SUSPENDED, &device->flags);
-
-       if (device->discipline->freeze)
-               rc = device->discipline->freeze(device);
-
-       /* disallow new I/O */
-       dasd_device_set_stop_bits(device, DASD_STOPPED_PM);
-
-       /* clear active requests and requeue them to block layer if possible */
-       INIT_LIST_HEAD(&freeze_queue);
-       spin_lock_irq(get_ccwdev_lock(cdev));
+       INIT_LIST_HEAD(&requeue_queue);
+       spin_lock_irq(get_ccwdev_lock(device->cdev));
        rc = 0;
        list_for_each_entry_safe(cqr, n, &device->ccw_queue, devlist) {
                /* Check status and move request to flush_queue */
@@ -3791,25 +3818,22 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev)
                                dev_err(&device->cdev->dev,
                                        "Unable to terminate request %p "
                                        "on suspend\n", cqr);
-                               spin_unlock_irq(get_ccwdev_lock(cdev));
+                               spin_unlock_irq(get_ccwdev_lock(device->cdev));
                                dasd_put_device(device);
                                return rc;
                        }
                }
-               list_move_tail(&cqr->devlist, &freeze_queue);
+               list_move_tail(&cqr->devlist, &requeue_queue);
        }
-       spin_unlock_irq(get_ccwdev_lock(cdev));
+       spin_unlock_irq(get_ccwdev_lock(device->cdev));
 
-       list_for_each_entry_safe(cqr, n, &freeze_queue, devlist) {
+       list_for_each_entry_safe(cqr, n, &requeue_queue, devlist) {
                wait_event(dasd_flush_wq,
                           (cqr->status != DASD_CQR_CLEAR_PENDING));
-               if (cqr->status == DASD_CQR_CLEARED)
-                       cqr->status = DASD_CQR_QUEUED;
 
-               /* requeue requests to blocklayer will only work for
-                  block device requests */
-               if (_dasd_requeue_request(cqr))
-                       continue;
+               /* mark sleepon requests as ended */
+               if (cqr->callback_data == DASD_SLEEPON_START_TAG)
+                       cqr->callback_data = DASD_SLEEPON_END_TAG;
 
                /* remove requests from device and block queue */
                list_del_init(&cqr->devlist);
@@ -3821,6 +3845,14 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev)
                        dasd_free_erp_request(cqr, cqr->memdev);
                        cqr = refers;
                }
+
+               /*
+                * requeue requests to blocklayer will only work
+                * for block device requests
+                */
+               if (_dasd_requeue_request(cqr))
+                       continue;
+
                if (cqr->block)
                        list_del_init(&cqr->blocklist);
                cqr->block->base->discipline->free_cp(
@@ -3831,15 +3863,56 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev)
         * if requests remain then they are internal request
         * and go back to the device queue
         */
-       if (!list_empty(&freeze_queue)) {
+       if (!list_empty(&requeue_queue)) {
                /* move freeze_queue to start of the ccw_queue */
-               spin_lock_irq(get_ccwdev_lock(cdev));
-               list_splice_tail(&freeze_queue, &device->ccw_queue);
-               spin_unlock_irq(get_ccwdev_lock(cdev));
+               spin_lock_irq(get_ccwdev_lock(device->cdev));
+               list_splice_tail(&requeue_queue, &device->ccw_queue);
+               spin_unlock_irq(get_ccwdev_lock(device->cdev));
        }
-       dasd_put_device(device);
+       /* wake up generic waitqueue for eventually ended sleepon requests */
+       wake_up(&generic_waitq);
        return rc;
 }
+
+static void do_requeue_requests(struct work_struct *work)
+{
+       struct dasd_device *device = container_of(work, struct dasd_device,
+                                                 requeue_requests);
+       dasd_generic_requeue_all_requests(device);
+       dasd_device_remove_stop_bits(device, DASD_STOPPED_NOT_ACC);
+       if (device->block)
+               dasd_schedule_block_bh(device->block);
+       dasd_put_device(device);
+}
+
+void dasd_schedule_requeue(struct dasd_device *device)
+{
+       dasd_get_device(device);
+       /* queue call to dasd_reload_device to the kernel event daemon. */
+       if (!schedule_work(&device->requeue_requests))
+               dasd_put_device(device);
+}
+EXPORT_SYMBOL(dasd_schedule_requeue);
+
+int dasd_generic_pm_freeze(struct ccw_device *cdev)
+{
+       struct dasd_device *device = dasd_device_from_cdev(cdev);
+       int rc;
+
+       if (IS_ERR(device))
+               return PTR_ERR(device);
+
+       /* mark device as suspended */
+       set_bit(DASD_FLAG_SUSPENDED, &device->flags);
+
+       if (device->discipline->freeze)
+               rc = device->discipline->freeze(device);
+
+       /* disallow new I/O */
+       dasd_device_set_stop_bits(device, DASD_STOPPED_PM);
+
+       return dasd_generic_requeue_all_requests(device);
+}
 EXPORT_SYMBOL_GPL(dasd_generic_pm_freeze);
 
 int dasd_generic_restore_device(struct ccw_device *cdev)
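
The interrupt handler above calls device->discipline->handle_hpf_error, whose ECKD implementation lives in dasd_eckd.c and is not part of the loaded excerpt. A sketch of what such a hook has to do, inferred only from the call sites in this file (names and ordering are assumptions, not the actual ECKD code):

/*
 * Sketch of a discipline handle_hpf_error hook: stop new I/O, turn HPF
 * off so retries use normal CCW I/O, and hand in-flight requests back
 * to the block layer via the new requeue worker.
 */
static void dasd_handle_hpf_error_sketch(struct dasd_device *device,
                                         struct irb *irb)
{
        dasd_device_set_stop_bits(device, DASD_STOPPED_NOT_ACC);
        if (device->discipline->disable_hpf)
                device->discipline->disable_hpf(device);
        dasd_schedule_requeue(device);
}

The DASD_STOPPED_NOT_ACC bit is the same one do_requeue_requests() clears after the requeue has run, which is why dasd_generic_path_event() above also sets it before calling dasd_schedule_requeue().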
drivers/s390/block/dasd_3990_erp.c: 46 additions & 0 deletions
@@ -2208,6 +2208,51 @@ dasd_3990_erp_inspect_32(struct dasd_ccw_req * erp, char *sense)
 
 }                              /* end dasd_3990_erp_inspect_32 */
 
+static void dasd_3990_erp_disable_path(struct dasd_device *device, __u8 lpum)
+{
+       int pos = pathmask_to_pos(lpum);
+
+       /* no remaining path, cannot disable */
+       if (!(dasd_path_get_opm(device) & ~lpum))
+               return;
+
+       dev_err(&device->cdev->dev,
+               "Path %x.%02x (pathmask %02x) is disabled - IFCC threshold exceeded\n",
+               device->path[pos].cssid, device->path[pos].chpid, lpum);
+       dasd_path_remove_opm(device, lpum);
+       dasd_path_add_ifccpm(device, lpum);
+       device->path[pos].errorclk = 0;
+       atomic_set(&device->path[pos].error_count, 0);
+}
+
+static void dasd_3990_erp_account_error(struct dasd_ccw_req *erp)
+{
+       struct dasd_device *device = erp->startdev;
+       __u8 lpum = erp->refers->irb.esw.esw1.lpum;
+       int pos = pathmask_to_pos(lpum);
+       unsigned long long clk;
+
+       if (!device->path_thrhld)
+               return;
+
+       clk = get_tod_clock();
+       /*
+        * check if the last error is longer ago than the timeout,
+        * if so reset error state
+        */
+       if ((tod_to_ns(clk - device->path[pos].errorclk) / NSEC_PER_SEC)
+           >= device->path_interval) {
+               atomic_set(&device->path[pos].error_count, 0);
+               device->path[pos].errorclk = 0;
+       }
+       atomic_inc(&device->path[pos].error_count);
+       device->path[pos].errorclk = clk;
+       /* threshold exceeded disable path if possible */
+       if (atomic_read(&device->path[pos].error_count) >=
+           device->path_thrhld)
+               dasd_3990_erp_disable_path(device, lpum);
+}
+
 /*
  *****************************************************************************
  * main ERP control functions (24 and 32 byte sense)
@@ -2237,6 +2282,7 @@ dasd_3990_erp_control_check(struct dasd_ccw_req *erp)
                                                 | SCHN_STAT_CHN_CTRL_CHK)) {
                DBF_DEV_EVENT(DBF_WARNING, device, "%s",
                              "channel or interface control check");
+               dasd_3990_erp_account_error(erp);
                erp = dasd_3990_erp_action_4(erp, NULL);
        }
        return erp;
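
The accounting policy in dasd_3990_erp_account_error() is simple enough to model outside the kernel: count IFCCs per path, reset the count after a quiet interval, and disable the path when a threshold is crossed. A standalone sketch in user-space C, with 256 errors / 300 seconds as assumed example values (the real limits come from device->path_thrhld and device->path_interval, which are configurable):

/* Standalone model of the per-path IFCC accounting policy above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct path_model {
        unsigned int error_count;
        uint64_t last_error_ns;         /* 0 means "no recorded error" */
};

static bool account_ifcc(struct path_model *p, uint64_t now_ns,
                         unsigned int threshold, unsigned int interval_s)
{
        /* a quiet period of at least interval_s resets the error state */
        if ((now_ns - p->last_error_ns) / 1000000000ULL >= interval_s)
                p->error_count = 0;
        p->error_count++;
        p->last_error_ns = now_ns;
        return p->error_count >= threshold;     /* true: disable the path */
}

int main(void)
{
        struct path_model p = { 0, 0 };
        uint64_t now = 0;
        unsigned int i;

        /* 256 IFCCs, one per second: no quiet gap, so the path trips */
        for (i = 0; i < 256; i++) {
                now += 1000000000ULL;
                if (account_ifcc(&p, now, 256, 300))
                        printf("path disabled after %u errors\n",
                               p.error_count);
        }
        return 0;
}

With one error per second there is never a 300-second quiet gap, so the model reports the path as disabled exactly when the 256th error is accounted; spacing the errors more than 300 seconds apart would keep the count at one indefinitely.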
[Diff for the remaining four changed files was not loaded on this page.]
