Skip to content

Commit

Permalink
vfio/mlx5: Introduce vfio precopy ioctl implementation
Browse files Browse the repository at this point in the history
vfio precopy ioctl returns an estimation of data available for
transferring from the device.

Whenever a user is using VFIO_MIG_GET_PRECOPY_INFO, track the current
state of the device, and if needed, append the dirty data to the
transfer FD data. This is done by saving a middle state.

As mlx5 runs the SAVE command asynchronously, make sure to query for
incremental data only once there is no active save command.
Running both in parallel might end up with a failure in the incremental
query command on an un-tracked vhca.

Also, a middle state will be saved only after the previous state has
finished its SAVE command and has been fully transferred; this prevents
endless use of resources.

Co-developed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20221206083438.37807-11-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
  • Loading branch information
Yishai Hadas authored and Alex Williamson committed Dec 6, 2022
1 parent 0c9a38f commit 0dce165
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 0 deletions.
16 changes: 16 additions & 0 deletions drivers/vfio/pci/mlx5/cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,25 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
bool inc = query_flags & MLX5VF_QUERY_INC;
int ret;

lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;

/*
* In case PRE_COPY is used, saving_migf is exposed while device is
* running. Make sure to run only once there is no active save command.
* Running both in parallel, might end-up with a failure in the
* incremental query command on un-tracked vhca.
*/
if (inc) {
ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
if (ret)
return ret;
}

MLX5_SET(query_vhca_migration_state_in, in, opcode,
MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
Expand All @@ -82,6 +95,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,

ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
if (inc)
complete(&mvdev->saving_migf->save_comp);

if (ret)
return ret;

Expand Down
111 changes: 111 additions & 0 deletions drivers/vfio/pci/mlx5/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -294,10 +294,121 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
wake_up_interruptible(&migf->poll_wait);
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
struct mlx5_vhca_data_buffer *buf;
struct vfio_precopy_info info = {};
loff_t *pos = &filp->f_pos;
unsigned long minsz;
size_t inc_length = 0;
bool end_of_data;
int ret;

if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
return -ENOTTY;

minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;

if (info.argsz < minsz)
return -EINVAL;

mutex_lock(&mvdev->state_mutex);
if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
ret = -EINVAL;
goto err_state_unlock;
}

/*
* We can't issue a SAVE command when the device is suspended, so as
* part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
* bytes that can't be read.
*/
if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
/*
* Once the query returns it's guaranteed that there is no
* active SAVE command.
* As so, the other code below is safe with the proper locks.
*/
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
MLX5VF_QUERY_INC);
if (ret)
goto err_state_unlock;
}

mutex_lock(&migf->lock);
if (migf->state == MLX5_MIGF_STATE_ERROR) {
ret = -ENODEV;
goto err_migf_unlock;
}

buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
if (buf) {
if (buf->start_pos == 0) {
info.initial_bytes = buf->header_image_size - *pos;
} else if (buf->start_pos ==
sizeof(struct mlx5_vf_migration_header)) {
/* First data buffer following the header */
info.initial_bytes = buf->start_pos +
buf->length - *pos;
} else {
info.dirty_bytes = buf->start_pos + buf->length - *pos;
}
} else {
if (!end_of_data) {
ret = -EINVAL;
goto err_migf_unlock;
}

info.dirty_bytes = inc_length;
}

if (!end_of_data || !inc_length) {
mutex_unlock(&migf->lock);
goto done;
}

mutex_unlock(&migf->lock);
/*
* We finished transferring the current state and the device has a
* dirty state, save a new state to be ready for.
*/
buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
mlx5vf_mark_err(migf);
goto err_state_unlock;
}

ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
if (ret) {
mlx5vf_mark_err(migf);
mlx5vf_put_data_buffer(buf);
goto err_state_unlock;
}

done:
mlx5vf_state_mutex_unlock(mvdev);
return copy_to_user((void __user *)arg, &info, minsz);
err_migf_unlock:
mutex_unlock(&migf->lock);
err_state_unlock:
mlx5vf_state_mutex_unlock(mvdev);
return ret;
}

/*
 * File operations for the mlx5vf save migration file: wires up the read,
 * poll, ioctl and release handlers; seeking is routed to no_llseek.
 */
static const struct file_operations mlx5vf_save_fops = {
	.owner		= THIS_MODULE,
	.llseek		= no_llseek,
	.read		= mlx5vf_save_read,
	.poll		= mlx5vf_save_poll,
	.unlocked_ioctl	= mlx5vf_precopy_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.release	= mlx5vf_release_file,
};
Expand Down

0 comments on commit 0dce165

Please sign in to comment.