Skip to content

Commit

Permalink
vfio/virtio: Add PRE_COPY support for live migration
Browse files Browse the repository at this point in the history
Add PRE_COPY support for live migration.

This functionality may reduce the downtime upon STOP_COPY by letting
the target machine receive some 'initial data' from the source while
the machine is still in its RUNNING state, so it can prepare itself
ahead of time for the final STOP_COPY data.

The Virtio specification does not support reading partial or
incremental device contexts. This means that during the PRE_COPY state,
the vfio-virtio driver reads the full device state.

Since the device state can change, and the benefit is highest when the
pre-copy data closely matches the final data, we read the data in a
rate-limited mode.

This means we avoid reading new data from the device for a specified
time interval after the last read.

With PRE_COPY enabled, we observed a downtime reduction of approximately
70-75% in various scenarios compared to when PRE_COPY was disabled,
while keeping the total migration time nearly the same.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20241113115200.209269-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
  • Loading branch information
Yishai Hadas authored and Alex Williamson committed Nov 13, 2024
1 parent 0bbc82e commit 6cea64b
Show file tree
Hide file tree
Showing 2 changed files with 227 additions and 8 deletions.
4 changes: 4 additions & 0 deletions drivers/vfio/pci/virtio/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

enum virtiovf_migf_state {
VIRTIOVF_MIGF_STATE_ERROR = 1,
VIRTIOVF_MIGF_STATE_PRECOPY = 2,
VIRTIOVF_MIGF_STATE_COMPLETE = 3,
};

enum virtiovf_load_state {
Expand Down Expand Up @@ -57,6 +59,8 @@ struct virtiovf_migration_file {
/* synchronize access to the file state */
struct mutex lock;
loff_t max_pos;
u64 pre_copy_initial_bytes;
struct ratelimit_state pre_copy_rl_state;
u64 record_size;
u32 record_tag;
u8 has_obj_id:1;
Expand Down
231 changes: 223 additions & 8 deletions drivers/vfio/pci/virtio/migrate.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
/* Initial target buffer size */
#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M

static int
virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
u32 ctx_size);

static struct page *
virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
unsigned long offset)
Expand Down Expand Up @@ -159,6 +163,41 @@ virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
}

static struct virtiovf_data_buffer *
virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
{
struct virtiovf_data_buffer *buf, *temp_buf;
struct list_head free_list;

INIT_LIST_HEAD(&free_list);

spin_lock_irq(&migf->list_lock);
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
list_del_init(&buf->buf_elm);
if (buf->allocated_length >= length) {
spin_unlock_irq(&migf->list_lock);
goto found;
}
/*
* Prevent holding redundant buffers. Put in a free
* list and call at the end not under the spin lock
* (&migf->list_lock) to minimize its scope usage.
*/
list_add(&buf->buf_elm, &free_list);
}
spin_unlock_irq(&migf->list_lock);
buf = virtiovf_alloc_data_buffer(migf, length);

found:
while ((temp_buf = list_first_entry_or_null(&free_list,
struct virtiovf_data_buffer, buf_elm))) {
list_del(&temp_buf->buf_elm);
virtiovf_free_data_buffer(temp_buf);
}

return buf;
}

static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
{
struct virtiovf_data_buffer *entry;
Expand Down Expand Up @@ -345,6 +384,7 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
{
struct virtiovf_migration_file *migf = filp->private_data;
struct virtiovf_data_buffer *vhca_buf;
bool first_loop_call = true;
bool end_of_data;
ssize_t done = 0;

Expand All @@ -362,6 +402,19 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
ssize_t count;

vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
if (first_loop_call) {
first_loop_call = false;
/* Temporary end of file as part of PRE_COPY */
if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
done = -ENOMSG;
goto out_unlock;
}
if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
done = -EINVAL;
goto out_unlock;
}
}

if (end_of_data)
goto out_unlock;

Expand All @@ -383,9 +436,101 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
return done;
}

static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct virtiovf_migration_file *migf = filp->private_data;
struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
struct vfio_precopy_info info = {};
loff_t *pos = &filp->f_pos;
bool end_of_data = false;
unsigned long minsz;
u32 ctx_size = 0;
int ret;

if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
return -ENOTTY;

minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;

if (info.argsz < minsz)
return -EINVAL;

mutex_lock(&virtvdev->state_mutex);
if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
ret = -EINVAL;
goto err_state_unlock;
}

/*
* The virtio specification does not include a PRE_COPY concept.
* Since we can expect the data to remain the same for a certain period,
* we use a rate limiter mechanism before making a call to the device.
*/
if (__ratelimit(&migf->pre_copy_rl_state)) {

ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
&ctx_size);
if (ret)
goto err_state_unlock;
}

mutex_lock(&migf->lock);
if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
ret = -ENODEV;
goto err_migf_unlock;
}

if (migf->pre_copy_initial_bytes > *pos) {
info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
} else {
info.dirty_bytes = migf->max_pos - *pos;
if (!info.dirty_bytes)
end_of_data = true;
info.dirty_bytes += ctx_size;
}

if (!end_of_data || !ctx_size) {
mutex_unlock(&migf->lock);
goto done;
}

mutex_unlock(&migf->lock);
/*
* We finished transferring the current state and the device has a
* dirty state, read a new state.
*/
ret = virtiovf_read_device_context_chunk(migf, ctx_size);
if (ret)
/*
* The machine is running, and context size could be grow, so no reason to mark
* the device state as VIRTIOVF_MIGF_STATE_ERROR.
*/
goto err_state_unlock;

done:
virtiovf_state_mutex_unlock(virtvdev);
if (copy_to_user((void __user *)arg, &info, minsz))
return -EFAULT;
return 0;

err_migf_unlock:
mutex_unlock(&migf->lock);
err_state_unlock:
virtiovf_state_mutex_unlock(virtvdev);
return ret;
}

static const struct file_operations virtiovf_save_fops = {
.owner = THIS_MODULE,
.read = virtiovf_save_read,
.unlocked_ioctl = virtiovf_precopy_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.release = virtiovf_release_file,
};

Expand Down Expand Up @@ -429,7 +574,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
int nent;
int ret;

buf = virtiovf_alloc_data_buffer(migf, ctx_size);
buf = virtiovf_get_data_buffer(migf, ctx_size);
if (IS_ERR(buf))
return PTR_ERR(buf);

Expand Down Expand Up @@ -464,7 +609,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
goto out;

buf->length = res_size;
header_buf = virtiovf_alloc_data_buffer(migf,
header_buf = virtiovf_get_data_buffer(migf,
sizeof(struct virtiovf_migration_header));
if (IS_ERR(header_buf)) {
ret = PTR_ERR(header_buf);
Expand All @@ -489,8 +634,43 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
return ret;
}

static int
virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
{
struct virtiovf_migration_file *migf = virtvdev->saving_migf;
u32 ctx_size;
int ret;

if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
return -ENODEV;

ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
&ctx_size);
if (ret)
goto err;

if (!ctx_size) {
ret = -EINVAL;
goto err;
}

ret = virtiovf_read_device_context_chunk(migf, ctx_size);
if (ret)
goto err;

migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
return 0;

err:
migf->state = VIRTIOVF_MIGF_STATE_ERROR;
return ret;
}

static struct virtiovf_migration_file *
virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
bool pre_copy)
{
struct virtiovf_migration_file *migf;
u32 ctx_size;
Expand Down Expand Up @@ -541,6 +721,18 @@ virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
if (ret)
goto out_clean;

if (pre_copy) {
migf->pre_copy_initial_bytes = migf->max_pos;
/* Arbitrarily set the pre-copy rate limit to 1-second intervals */
ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);
/* Prevent any rate messages upon its usage */
ratelimit_set_flags(&migf->pre_copy_rl_state,
RATELIMIT_MSG_ON_RELEASE);
migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
} else {
migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
}

return migf;

out_clean:
Expand Down Expand Up @@ -950,15 +1142,17 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
return NULL;
}

if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
if (ret)
return ERR_PTR(ret);
return NULL;
}

if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
if (ret)
return ERR_PTR(ret);
Expand All @@ -968,15 +1162,17 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
struct virtiovf_migration_file *migf;

migf = virtiovf_pci_save_device_data(virtvdev);
migf = virtiovf_pci_save_device_data(virtvdev, false);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
virtvdev->saving_migf = migf;
return migf->filp;
}

if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
virtiovf_disable_fds(virtvdev);
return NULL;
}
Expand All @@ -997,6 +1193,24 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
return NULL;
}

if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
(cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
struct virtiovf_migration_file *migf;

migf = virtiovf_pci_save_device_data(virtvdev, true);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
virtvdev->saving_migf = migf;
return migf->filp;
}

if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
ret = virtiovf_pci_save_device_final_data(virtvdev);
return ret ? ERR_PTR(ret) : NULL;
}

/*
* vfio_mig_get_next_state() does not use arcs other than the above
*/
Expand Down Expand Up @@ -1101,7 +1315,8 @@ void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
spin_lock_init(&virtvdev->reset_lock);
virtvdev->core_device.vdev.migration_flags =
VFIO_MIGRATION_STOP_COPY |
VFIO_MIGRATION_P2P;
VFIO_MIGRATION_P2P |
VFIO_MIGRATION_PRE_COPY;
virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
}

Expand Down

0 comments on commit 6cea64b

Please sign in to comment.