Skip to content

Commit

Permalink
accel/ivpu: Abort all jobs after command queue unregister
Browse files Browse the repository at this point in the history
With the hardware scheduler, it is not expected to receive JOB_DONE
notifications from the NPU FW for jobs aborted due to a command queue destroy
JSM command.

Remove jobs submitted to an unregistered command queue from submitted_jobs_xa
to avoid triggering a TDR in such a case.

Add an explicit submitted_jobs_lock that protects access to the list of
submitted jobs, which is now used to find jobs to abort.

Move the context abort procedure to a separate work queue so as not to slow
down handling of IPCs or DCT requests in cases where a job abort takes longer,
especially when destruction of the last job of a specific context results
in context release.

Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
Signed-off-by: Maciej Falkowski <maciej.falkowski@linux.intel.com>
Reviewed-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250107173238.381120-4-maciej.falkowski@linux.intel.com
  • Loading branch information
Karol Wachowski authored and Jacek Lawrynowicz committed Jan 9, 2025
1 parent 465a391 commit 5bbccad
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 62 deletions.
32 changes: 6 additions & 26 deletions drivers/accel/ivpu/ivpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@
#define DRIVER_VERSION_STR "1.0.0 " UTS_RELEASE
#endif

static struct lock_class_key submitted_jobs_xa_lock_class_key;

int ivpu_dbg_mask;
module_param_named(dbg_mask, ivpu_dbg_mask, int, 0644);
MODULE_PARM_DESC(dbg_mask, "Driver debug mask. See IVPU_DBG_* macros.");
Expand Down Expand Up @@ -467,26 +465,6 @@ static const struct drm_driver driver = {
.major = 1,
};

/*
 * Abort every user context that has recorded MMU faults and has not been
 * aborted yet. Iterates the device-wide context_xa under context_list_lock.
 */
static void ivpu_context_abort_invalid(struct ivpu_device *vdev)
{
struct ivpu_file_priv *file_priv;
unsigned long ctx_id;

mutex_lock(&vdev->context_list_lock);

xa_for_each(&vdev->context_xa, ctx_id, file_priv) {
/* Only faulting contexts need aborting; skip ones already handled */
if (!file_priv->has_mmu_faults || file_priv->aborted)
continue;

/* Per-context lock serializes against submissions on this context */
mutex_lock(&file_priv->lock);
ivpu_context_abort_locked(file_priv);
file_priv->aborted = true;
mutex_unlock(&file_priv->lock);
}

mutex_unlock(&vdev->context_list_lock);
}

static irqreturn_t ivpu_irq_thread_handler(int irq, void *arg)
{
struct ivpu_device *vdev = arg;
Expand All @@ -500,9 +478,6 @@ static irqreturn_t ivpu_irq_thread_handler(int irq, void *arg)
case IVPU_HW_IRQ_SRC_IPC:
ivpu_ipc_irq_thread_handler(vdev);
break;
case IVPU_HW_IRQ_SRC_MMU_EVTQ:
ivpu_context_abort_invalid(vdev);
break;
case IVPU_HW_IRQ_SRC_DCT:
ivpu_pm_dct_irq_thread_handler(vdev);
break;
Expand Down Expand Up @@ -619,16 +594,21 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key);
INIT_LIST_HEAD(&vdev->bo_list);

vdev->db_limit.min = IVPU_MIN_DB;
vdev->db_limit.max = IVPU_MAX_DB;

INIT_WORK(&vdev->context_abort_work, ivpu_context_abort_thread_handler);

ret = drmm_mutex_init(&vdev->drm, &vdev->context_list_lock);
if (ret)
goto err_xa_destroy;

ret = drmm_mutex_init(&vdev->drm, &vdev->submitted_jobs_lock);
if (ret)
goto err_xa_destroy;

ret = drmm_mutex_init(&vdev->drm, &vdev->bo_list_lock);
if (ret)
goto err_xa_destroy;
Expand Down
2 changes: 2 additions & 0 deletions drivers/accel/ivpu/ivpu_drv.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ struct ivpu_device {
struct mutex context_list_lock; /* Protects user context addition/removal */
struct xarray context_xa;
struct xa_limit context_xa_limit;
struct work_struct context_abort_work;

struct xarray db_xa;
struct xa_limit db_limit;
Expand All @@ -145,6 +146,7 @@ struct ivpu_device {
struct mutex bo_list_lock; /* Protects bo_list */
struct list_head bo_list;

struct mutex submitted_jobs_lock; /* Protects submitted_jobs */
struct xarray submitted_jobs_xa;
struct ivpu_ipc_consumer job_done_consumer;

Expand Down
137 changes: 105 additions & 32 deletions drivers/accel/ivpu/ivpu_job.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ static struct ivpu_cmdq *ivpu_cmdq_create(struct ivpu_file_priv *file_priv, u8 p

cmdq->priority = priority;
cmdq->is_legacy = is_legacy;
cmdq->is_valid = true;

ret = xa_alloc_cyclic(&file_priv->cmdq_xa, &cmdq->id, cmdq, file_priv->cmdq_limit,
&file_priv->cmdq_id_next, GFP_KERNEL);
Expand All @@ -130,6 +131,7 @@ static struct ivpu_cmdq *ivpu_cmdq_create(struct ivpu_file_priv *file_priv, u8 p
goto err_free_cmdq;
}

ivpu_dbg(vdev, JOB, "Command queue %d created, ctx %d\n", cmdq->id, file_priv->ctx.id);
return cmdq;

err_free_cmdq:
Expand Down Expand Up @@ -247,7 +249,8 @@ static int ivpu_cmdq_unregister(struct ivpu_file_priv *file_priv, struct ivpu_cm
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) {
ret = ivpu_jsm_hws_destroy_cmdq(vdev, file_priv->ctx.id, cmdq->id);
if (!ret)
ivpu_dbg(vdev, JOB, "Command queue %d destroyed\n", cmdq->id);
ivpu_dbg(vdev, JOB, "Command queue %d destroyed, ctx %d\n",
cmdq->id, file_priv->ctx.id);
}

ret = ivpu_jsm_unregister_db(vdev, cmdq->db_id);
Expand Down Expand Up @@ -303,8 +306,8 @@ static struct ivpu_cmdq *ivpu_cmdq_acquire(struct ivpu_file_priv *file_priv, u32
lockdep_assert_held(&file_priv->lock);

cmdq = xa_load(&file_priv->cmdq_xa, cmdq_id);
if (!cmdq) {
ivpu_err(vdev, "Failed to find command queue with ID: %u\n", cmdq_id);
if (!cmdq || !cmdq->is_valid) {
ivpu_warn_ratelimited(vdev, "Failed to find command queue with ID: %u\n", cmdq_id);
return NULL;
}

Expand Down Expand Up @@ -356,25 +359,21 @@ void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev)
mutex_unlock(&vdev->context_list_lock);
}

/* Unregister every command queue owned by @file_priv (iterates its cmdq_xa). */
static void ivpu_cmdq_unregister_all(struct ivpu_file_priv *file_priv)
{
struct ivpu_cmdq *cmdq;
unsigned long cmdq_id;

xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq)
ivpu_cmdq_unregister(file_priv, cmdq);
}

void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv)
{
struct ivpu_device *vdev = file_priv->vdev;
struct ivpu_cmdq *cmdq;
unsigned long cmdq_id;

lockdep_assert_held(&file_priv->lock);

ivpu_cmdq_unregister_all(file_priv);
xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq)
ivpu_cmdq_unregister(file_priv, cmdq);

if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS)
ivpu_jsm_context_release(vdev, file_priv->ctx.id);

file_priv->aborted = true;
}

static int ivpu_cmdq_push_job(struct ivpu_cmdq *cmdq, struct ivpu_job *job)
Expand Down Expand Up @@ -513,23 +512,23 @@ static struct ivpu_job *ivpu_job_remove_from_submitted_jobs(struct ivpu_device *
{
struct ivpu_job *job;

xa_lock(&vdev->submitted_jobs_xa);
job = __xa_erase(&vdev->submitted_jobs_xa, job_id);
lockdep_assert_held(&vdev->submitted_jobs_lock);

job = xa_erase(&vdev->submitted_jobs_xa, job_id);
if (xa_empty(&vdev->submitted_jobs_xa) && job) {
vdev->busy_time = ktime_add(ktime_sub(ktime_get(), vdev->busy_start_ts),
vdev->busy_time);
}

xa_unlock(&vdev->submitted_jobs_xa);

return job;
}

static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status)
{
struct ivpu_job *job;

lockdep_assert_held(&vdev->submitted_jobs_lock);

job = ivpu_job_remove_from_submitted_jobs(vdev, job_id);
if (!job)
return -ENOENT;
Expand All @@ -548,6 +547,10 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32
ivpu_stop_job_timeout_detection(vdev);

ivpu_rpm_put(vdev);

if (!xa_empty(&vdev->submitted_jobs_xa))
ivpu_start_job_timeout_detection(vdev);

return 0;
}

Expand All @@ -556,8 +559,26 @@ void ivpu_jobs_abort_all(struct ivpu_device *vdev)
struct ivpu_job *job;
unsigned long id;

mutex_lock(&vdev->submitted_jobs_lock);

xa_for_each(&vdev->submitted_jobs_xa, id, job)
ivpu_job_signal_and_destroy(vdev, id, DRM_IVPU_JOB_STATUS_ABORTED);

mutex_unlock(&vdev->submitted_jobs_lock);
}

/*
 * Abort all submitted jobs that belong to context @ctx_id and command queue
 * @cmdq_id: each matching job is signaled with DRM_IVPU_JOB_STATUS_ABORTED
 * and removed from submitted_jobs_xa. Holding submitted_jobs_lock across the
 * walk keeps the xarray stable while ivpu_job_signal_and_destroy() erases
 * entries (that callee asserts the same lock is held).
 */
void ivpu_cmdq_abort_all_jobs(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id)
{
struct ivpu_job *job;
unsigned long id;

mutex_lock(&vdev->submitted_jobs_lock);

xa_for_each(&vdev->submitted_jobs_xa, id, job)
if (job->file_priv->ctx.id == ctx_id && job->cmdq_id == cmdq_id)
ivpu_job_signal_and_destroy(vdev, id, DRM_IVPU_JOB_STATUS_ABORTED);

mutex_unlock(&vdev->submitted_jobs_lock);
}

static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id)
Expand Down Expand Up @@ -590,15 +611,18 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id)
goto err_unlock_file_priv;
}

xa_lock(&vdev->submitted_jobs_xa);
job->cmdq_id = cmdq->id;

mutex_lock(&vdev->submitted_jobs_lock);

is_first_job = xa_empty(&vdev->submitted_jobs_xa);
ret = __xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit,
&file_priv->job_id_next, GFP_KERNEL);
ret = xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit,
&file_priv->job_id_next, GFP_KERNEL);
if (ret < 0) {
ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n",
file_priv->ctx.id);
ret = -EBUSY;
goto err_unlock_submitted_jobs_xa;
goto err_unlock_submitted_jobs;
}

ret = ivpu_cmdq_push_job(cmdq, job);
Expand All @@ -621,19 +645,21 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id)
job->job_id, file_priv->ctx.id, job->engine_idx, cmdq->priority,
job->cmd_buf_vpu_addr, cmdq->jobq->header.tail);

xa_unlock(&vdev->submitted_jobs_xa);

mutex_unlock(&vdev->submitted_jobs_lock);
mutex_unlock(&file_priv->lock);

if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW))
if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW)) {
mutex_lock(&vdev->submitted_jobs_lock);
ivpu_job_signal_and_destroy(vdev, job->job_id, VPU_JSM_STATUS_SUCCESS);
mutex_unlock(&vdev->submitted_jobs_lock);
}

return 0;

err_erase_xa:
__xa_erase(&vdev->submitted_jobs_xa, job->job_id);
err_unlock_submitted_jobs_xa:
xa_unlock(&vdev->submitted_jobs_xa);
xa_erase(&vdev->submitted_jobs_xa, job->job_id);
err_unlock_submitted_jobs:
mutex_unlock(&vdev->submitted_jobs_lock);
err_unlock_file_priv:
mutex_unlock(&file_priv->lock);
ivpu_rpm_put(vdev);
Expand Down Expand Up @@ -843,19 +869,33 @@ int ivpu_cmdq_create_ioctl(struct drm_device *dev, void *data, struct drm_file *
int ivpu_cmdq_destroy_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
struct ivpu_file_priv *file_priv = file->driver_priv;
struct ivpu_device *vdev = file_priv->vdev;
struct drm_ivpu_cmdq_destroy *args = data;
struct ivpu_cmdq *cmdq;
u32 cmdq_id;
int ret = 0;

mutex_lock(&file_priv->lock);

cmdq = xa_load(&file_priv->cmdq_xa, args->cmdq_id);
if (!cmdq || cmdq->is_legacy) {
if (!cmdq || !cmdq->is_valid || cmdq->is_legacy) {
ret = -ENOENT;
goto unlock;
}

/*
* There is no way to stop executing jobs per command queue
* in OS scheduling mode, mark command queue as invalid instead
* and it will be freed together with context release.
*/
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) {
cmdq->is_valid = false;
goto unlock;
}

cmdq_id = cmdq->id;
ivpu_cmdq_destroy(file_priv, cmdq);
ivpu_cmdq_abort_all_jobs(vdev, file_priv->ctx.id, cmdq_id);
unlock:
mutex_unlock(&file_priv->lock);
return ret;
Expand All @@ -866,7 +906,6 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr,
struct vpu_jsm_msg *jsm_msg)
{
struct vpu_ipc_msg_payload_job_done *payload;
int ret;

if (!jsm_msg) {
ivpu_err(vdev, "IPC message has no JSM payload\n");
Expand All @@ -879,9 +918,10 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr,
}

payload = (struct vpu_ipc_msg_payload_job_done *)&jsm_msg->payload;
ret = ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status);
if (!ret && !xa_empty(&vdev->submitted_jobs_xa))
ivpu_start_job_timeout_detection(vdev);

mutex_lock(&vdev->submitted_jobs_lock);
ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status);
mutex_unlock(&vdev->submitted_jobs_lock);
}

void ivpu_job_done_consumer_init(struct ivpu_device *vdev)
Expand All @@ -894,3 +934,36 @@ void ivpu_job_done_consumer_fini(struct ivpu_device *vdev)
{
ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer);
}

/*
 * Deferred context-abort handler, run from vdev->context_abort_work so that
 * a slow abort (which may release the context on its last job) does not stall
 * IPC or DCT IRQ-thread handling.
 *
 * Lock order: context_list_lock -> file_priv->lock for the abort pass,
 * then submitted_jobs_lock alone for the job cleanup pass.
 */
void ivpu_context_abort_thread_handler(struct work_struct *work)
{
struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work);
struct ivpu_file_priv *file_priv;
unsigned long ctx_id;
struct ivpu_job *job;
unsigned long id;

mutex_lock(&vdev->context_list_lock);
xa_for_each(&vdev->context_xa, ctx_id, file_priv) {
/* Only faulting contexts need aborting; skip already-aborted ones */
if (!file_priv->has_mmu_faults || file_priv->aborted)
continue;

mutex_lock(&file_priv->lock);
ivpu_context_abort_locked(file_priv);
mutex_unlock(&file_priv->lock);
}
mutex_unlock(&vdev->context_list_lock);

/* OS scheduling mode: FW still sends JOB_DONE for aborted jobs, so stop here */
if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW)
return;
/*
 * In hardware scheduling mode NPU already has stopped processing jobs
 * and won't send us any further notifications, thus we have to free job related resources
 * and notify userspace
 */
mutex_lock(&vdev->submitted_jobs_lock);
xa_for_each(&vdev->submitted_jobs_xa, id, job)
if (job->file_priv->aborted)
ivpu_job_signal_and_destroy(vdev, job->job_id, DRM_IVPU_JOB_STATUS_ABORTED);
mutex_unlock(&vdev->submitted_jobs_lock);
}
4 changes: 4 additions & 0 deletions drivers/accel/ivpu/ivpu_job.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ struct ivpu_cmdq {
u32 id;
u32 db_id;
u8 priority;
bool is_valid;
bool is_legacy;
};

Expand All @@ -51,6 +52,7 @@ struct ivpu_job {
struct ivpu_file_priv *file_priv;
struct dma_fence *done_fence;
u64 cmd_buf_vpu_addr;
u32 cmdq_id;
u32 job_id;
u32 engine_idx;
size_t bo_count;
Expand All @@ -66,9 +68,11 @@ void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv);

void ivpu_cmdq_release_all_locked(struct ivpu_file_priv *file_priv);
void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev);
void ivpu_cmdq_abort_all_jobs(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id);

void ivpu_job_done_consumer_init(struct ivpu_device *vdev);
void ivpu_job_done_consumer_fini(struct ivpu_device *vdev);
void ivpu_context_abort_thread_handler(struct work_struct *work);

void ivpu_jobs_abort_all(struct ivpu_device *vdev);

Expand Down
Loading

0 comments on commit 5bbccad

Please sign in to comment.