Skip to content

Commit

Permalink
nvme-tcp-offload: Add controller level error recovery implementation
Browse files Browse the repository at this point in the history
In this patch, we implement controller-level error handling and recovery.
Upon an error detected by the ULP, or a controller reset initiated by
nvme-core (using the reset_ctrl workqueue), the ULP will initiate a controller
recovery, which includes teardown and re-connect of all queues.

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Arie Gershberg <agershberg@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Arie Gershberg authored and David S. Miller committed Jun 3, 2021
1 parent 5aadd5f commit 5faf6d6
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 2 deletions.
127 changes: 125 additions & 2 deletions drivers/nvme/host/tcp-offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,23 @@ void nvme_tcp_ofld_unregister_dev(struct nvme_tcp_ofld_dev *dev)
}
EXPORT_SYMBOL_GPL(nvme_tcp_ofld_unregister_dev);

/**
* nvme_tcp_ofld_error_recovery() - NVMeTCP Offload library error recovery
* function.
* @nctrl: NVMe controller instance to change to resetting.
*
* API function that changes the controller state to resetting.
* Part of the overall controller reset sequence.
*/
void nvme_tcp_ofld_error_recovery(struct nvme_ctrl *nctrl)
{
	/*
	 * Schedule the error recovery work only when the controller could be
	 * moved to the RESETTING state; otherwise there is nothing to do here.
	 */
	if (nvme_change_ctrl_state(nctrl, NVME_CTRL_RESETTING))
		queue_work(nvme_reset_wq, &to_tcp_ofld_ctrl(nctrl)->err_work);
}
EXPORT_SYMBOL_GPL(nvme_tcp_ofld_error_recovery);

/**
* nvme_tcp_ofld_report_queue_err() - NVMeTCP Offload report error event
* callback function. Pointed to by nvme_tcp_ofld_queue->report_err.
Expand All @@ -82,7 +99,8 @@ EXPORT_SYMBOL_GPL(nvme_tcp_ofld_unregister_dev);
*/
int nvme_tcp_ofld_report_queue_err(struct nvme_tcp_ofld_queue *queue)
{
	struct nvme_ctrl *nctrl;

	pr_err("nvme-tcp-offload queue error\n");

	/* Start controller-level recovery for the controller owning @queue */
	nctrl = &queue->ctrl->nctrl;
	nvme_tcp_ofld_error_recovery(nctrl);

	/* Always reports success to the caller */
	return 0;
}
Expand Down Expand Up @@ -287,6 +305,28 @@ nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new)
return rc;
}

/*
 * Follow-up after a failed (re)connect attempt: either schedule connect_work
 * to run again after the configured reconnect delay, or delete the
 * controller. Only acts while the controller is in the CONNECTING state.
 */
static void nvme_tcp_ofld_reconnect_or_remove(struct nvme_ctrl *nctrl)
{
	/* Not CONNECTING: a reset/delete is in progress, so leave it alone */
	if (nctrl->state != NVME_CTRL_CONNECTING) {
		/* NEW and LIVE are not expected on this path */
		WARN_ON_ONCE(nctrl->state == NVME_CTRL_NEW ||
			     nctrl->state == NVME_CTRL_LIVE);
		return;
	}

	if (!nvmf_should_reconnect(nctrl)) {
		dev_info(nctrl->device, "Removing controller...\n");
		nvme_delete_ctrl(nctrl);
		return;
	}

	dev_info(nctrl->device, "Reconnecting in %d seconds...\n",
		 nctrl->opts->reconnect_delay);
	/* reconnect_delay is in seconds; convert to jiffies for the timer */
	queue_delayed_work(nvme_wq,
			   &to_tcp_ofld_ctrl(nctrl)->connect_work,
			   nctrl->opts->reconnect_delay * HZ);
}

static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)
{
struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
Expand Down Expand Up @@ -410,10 +450,63 @@ nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove)
/* Placeholder - teardown_io_queues */
}

/*
 * Delayed-work handler for connect_work: counts the attempt, tries to set the
 * controller up again and, on failure, hands over to
 * nvme_tcp_ofld_reconnect_or_remove() to retry or remove the controller.
 */
static void nvme_tcp_ofld_reconnect_ctrl_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct nvme_tcp_ofld_ctrl *ctrl =
		container_of(dwork, struct nvme_tcp_ofld_ctrl, connect_work);
	struct nvme_ctrl *nctrl = &ctrl->nctrl;

	/* Count this attempt before trying, so the logs below include it */
	nctrl->nr_reconnects++;

	if (nvme_tcp_ofld_setup_ctrl(nctrl, false)) {
		dev_info(nctrl->device, "Failed reconnect attempt %d\n",
			 nctrl->nr_reconnects);
		nvme_tcp_ofld_reconnect_or_remove(nctrl);
		return;
	}

	dev_info(nctrl->device, "Successfully reconnected (%d attempt)\n",
		 nctrl->nr_reconnects);

	/* Connected again: restart the attempt counter */
	nctrl->nr_reconnects = 0;
}

/*
 * Work handler for err_work: stops keep-alive, tears down the I/O and admin
 * queues, then moves the controller to CONNECTING and lets
 * nvme_tcp_ofld_reconnect_or_remove() decide on the next step.
 */
static void nvme_tcp_ofld_error_recovery_work(struct work_struct *work)
{
	struct nvme_ctrl *nctrl =
		&container_of(work, struct nvme_tcp_ofld_ctrl, err_work)->nctrl;

	nvme_stop_keep_alive(nctrl);

	/* I/O queues first; restart them so pending requests fail fast */
	nvme_tcp_ofld_teardown_io_queues(nctrl, false);
	nvme_start_queues(nctrl);

	/* Then the admin queue, unquiesced again after its teardown */
	nvme_tcp_ofld_teardown_admin_queue(nctrl, false);
	blk_mq_unquiesce_queue(nctrl->admin_q);

	if (nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {
		nvme_tcp_ofld_reconnect_or_remove(nctrl);
		return;
	}

	/* A refused state change is only expected once a delete has started */
	WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&
		     nctrl->state != NVME_CTRL_DELETING_NOIO);
}

static void
nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown)
{
/* Placeholder - err_work and connect_work */
struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);

cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->connect_work);
nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown);
blk_mq_quiesce_queue(nctrl->admin_q);
if (shutdown)
Expand All @@ -428,6 +521,32 @@ static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl)
nvme_tcp_ofld_teardown_ctrl(nctrl, true);
}

/*
 * Work handler for the controller's reset_work: stops and tears down the
 * controller, moves it to CONNECTING and sets it up again. A failed setup is
 * counted as a reconnect attempt and passed on to
 * nvme_tcp_ofld_reconnect_or_remove().
 */
static void nvme_tcp_ofld_reset_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *nctrl =
		container_of(work, struct nvme_ctrl, reset_work);

	nvme_stop_ctrl(nctrl);
	nvme_tcp_ofld_teardown_ctrl(nctrl, false);

	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {
		/* A refused state change is only expected during a delete */
		WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&
			     nctrl->state != NVME_CTRL_DELETING_NOIO);
		return;
	}

	/* Setup returning zero means the reset completed */
	if (!nvme_tcp_ofld_setup_ctrl(nctrl, false))
		return;

	/* Setup failed: count the attempt, then retry or remove */
	nctrl->nr_reconnects++;
	nvme_tcp_ofld_reconnect_or_remove(nctrl);
}

static int
nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set,
struct request *rq,
Expand Down Expand Up @@ -521,6 +640,10 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
opts->nr_poll_queues + 1;
nctrl->sqsize = opts->queue_size - 1;
nctrl->kato = opts->kato;
INIT_DELAYED_WORK(&ctrl->connect_work,
nvme_tcp_ofld_reconnect_ctrl_work);
INIT_WORK(&ctrl->err_work, nvme_tcp_ofld_error_recovery_work);
INIT_WORK(&nctrl->reset_work, nvme_tcp_ofld_reset_ctrl_work);
if (!(opts->mask & NVMF_OPT_TRSVCID)) {
opts->trsvcid =
kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
Expand Down
1 change: 1 addition & 0 deletions drivers/nvme/host/tcp-offload.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,4 @@ struct nvme_tcp_ofld_ops {
/* Exported functions for lower vendor specific offload drivers */
int nvme_tcp_ofld_register_dev(struct nvme_tcp_ofld_dev *dev);
void nvme_tcp_ofld_unregister_dev(struct nvme_tcp_ofld_dev *dev);
void nvme_tcp_ofld_error_recovery(struct nvme_ctrl *nctrl);

0 comments on commit 5faf6d6

Please sign in to comment.