linux/drivers/vfio/pci/virtio/migrate.c
Yishai Hadas 6cea64b1db vfio/virtio: Add PRE_COPY support for live migration
Add PRE_COPY support for live migration.

This functionality may reduce the downtime upon STOP_COPY by letting
the target machine obtain some 'initial data' from the source while
the machine is still in its RUNNING state, allowing it to prepare
ahead of time for the final STOP_COPY data.

The Virtio specification does not support reading partial or
incremental device contexts, so during the PRE_COPY state the
vfio-virtio driver reads the full device state.

As the device state can change, and the benefit is highest when the
pre-copy data closely matches the final data, we read it in a
rate-limited mode.

This means we avoid reading new data from the device for a specified
time interval after the last read, as sketched below.
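
A minimal sketch of the gating pattern, assuming the standard kernel
ratelimit API (pre_copy_should_read_device() is an illustrative helper,
not part of this driver; the driver's actual usage appears in
virtiovf_precopy_ioctl() below):

#include <linux/ratelimit.h>

/* Illustrative: allow at most one device context read per second */
static DEFINE_RATELIMIT_STATE(pre_copy_rl, 1 * HZ, 1);

static bool pre_copy_should_read_device(void)
{
	/* Nonzero at most 'burst' (here 1) times per interval */
	return __ratelimit(&pre_copy_rl);
}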

With PRE_COPY enabled, we observed a downtime reduction of approximately
70-75% in various scenarios compared to when PRE_COPY was disabled,
while keeping the total migration time nearly the same.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20241113115200.209269-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2024-11-13 16:28:32 -07:00


// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
*/
#include <linux/device.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>
#include <linux/virtio_pci.h>
#include <linux/virtio_net.h>
#include <linux/virtio_pci_admin.h>
#include <linux/anon_inodes.h>
#include "common.h"
/* Device specification max parts size */
#define MAX_LOAD_SIZE (BIT_ULL(BITS_PER_TYPE \
(((struct virtio_admin_cmd_dev_parts_metadata_result *)0)->parts_size.size)) - 1)
/* Initial target buffer size */
#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M
static int
virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
u32 ctx_size);
static struct page *
virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
unsigned long offset)
{
unsigned long cur_offset = 0;
struct scatterlist *sg;
unsigned int i;
/* All accesses are sequential */
if (offset < buf->last_offset || !buf->last_offset_sg) {
buf->last_offset = 0;
buf->last_offset_sg = buf->table.sgt.sgl;
buf->sg_last_entry = 0;
}
cur_offset = buf->last_offset;
for_each_sg(buf->last_offset_sg, sg,
buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
if (offset < sg->length + cur_offset) {
buf->last_offset_sg = sg;
buf->sg_last_entry += i;
buf->last_offset = cur_offset;
return nth_page(sg_page(sg),
(offset - cur_offset) / PAGE_SIZE);
}
cur_offset += sg->length;
}
return NULL;
}
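/*
 * Grow @buf by @npages, allocating pages in bulk and appending them to the
 * buffer's scatter-gather append table. On an append failure only the pages
 * of the failed batch are freed here; previously appended pages are
 * reclaimed by virtiovf_free_data_buffer().
 */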
static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf,
unsigned int npages)
{
unsigned int to_alloc = npages;
struct page **page_list;
unsigned long filled;
unsigned int to_fill;
int ret;
int i;
to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
page_list = kvcalloc(to_fill, sizeof(*page_list), GFP_KERNEL_ACCOUNT);
if (!page_list)
return -ENOMEM;
do {
filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
page_list);
if (!filled) {
ret = -ENOMEM;
goto err;
}
to_alloc -= filled;
ret = sg_alloc_append_table_from_pages(&buf->table, page_list,
filled, 0, filled << PAGE_SHIFT, UINT_MAX,
SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT);
if (ret)
goto err_append;
buf->allocated_length += filled * PAGE_SIZE;
/* clean input for another bulk allocation */
memset(page_list, 0, filled * sizeof(*page_list));
to_fill = min_t(unsigned int, to_alloc,
PAGE_SIZE / sizeof(*page_list));
} while (to_alloc > 0);
kvfree(page_list);
return 0;
err_append:
for (i = filled - 1; i >= 0; i--)
__free_page(page_list[i]);
err:
kvfree(page_list);
return ret;
}
static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf)
{
struct sg_page_iter sg_iter;
/* Undo alloc_pages_bulk_array() */
for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
__free_page(sg_page_iter_page(&sg_iter));
sg_free_append_table(&buf->table);
kfree(buf);
}
static struct virtiovf_data_buffer *
virtiovf_alloc_data_buffer(struct virtiovf_migration_file *migf, size_t length)
{
struct virtiovf_data_buffer *buf;
int ret;
buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
if (!buf)
return ERR_PTR(-ENOMEM);
ret = virtiovf_add_migration_pages(buf,
DIV_ROUND_UP_ULL(length, PAGE_SIZE));
if (ret)
goto end;
buf->migf = migf;
return buf;
end:
virtiovf_free_data_buffer(buf);
return ERR_PTR(ret);
}
static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf)
{
spin_lock_irq(&buf->migf->list_lock);
list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
spin_unlock_irq(&buf->migf->list_lock);
}
static int
virtiovf_pci_alloc_obj_id(struct virtiovf_pci_core_device *virtvdev, u8 type,
u32 *obj_id)
{
return virtio_pci_admin_obj_create(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, type, obj_id);
}
static void
virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
{
virtio_pci_admin_obj_destroy(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
}
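/*
 * Fetch a data buffer whose allocation satisfies @length, preferring to
 * recycle one from the migration file's avail_list. Undersized buffers
 * encountered on the list are released, outside the spin lock, to avoid
 * holding redundant memory.
 */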
static struct virtiovf_data_buffer *
virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
{
struct virtiovf_data_buffer *buf, *temp_buf;
struct list_head free_list;
INIT_LIST_HEAD(&free_list);
spin_lock_irq(&migf->list_lock);
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
list_del_init(&buf->buf_elm);
if (buf->allocated_length >= length) {
spin_unlock_irq(&migf->list_lock);
goto found;
}
		/*
		 * Prevent holding redundant buffers. Put them on a free
		 * list and release them at the end, outside the spin lock
		 * (&migf->list_lock), to minimize its hold time.
		 */
list_add(&buf->buf_elm, &free_list);
}
spin_unlock_irq(&migf->list_lock);
buf = virtiovf_alloc_data_buffer(migf, length);
found:
while ((temp_buf = list_first_entry_or_null(&free_list,
struct virtiovf_data_buffer, buf_elm))) {
list_del(&temp_buf->buf_elm);
virtiovf_free_data_buffer(temp_buf);
}
return buf;
}
static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
{
struct virtiovf_data_buffer *entry;
if (migf->buf) {
virtiovf_free_data_buffer(migf->buf);
migf->buf = NULL;
}
if (migf->buf_header) {
virtiovf_free_data_buffer(migf->buf_header);
migf->buf_header = NULL;
}
list_splice(&migf->avail_list, &migf->buf_list);
while ((entry = list_first_entry_or_null(&migf->buf_list,
struct virtiovf_data_buffer, buf_elm))) {
list_del(&entry->buf_elm);
virtiovf_free_data_buffer(entry);
}
if (migf->has_obj_id)
virtiovf_pci_free_obj_id(migf->virtvdev, migf->obj_id);
}
static void virtiovf_disable_fd(struct virtiovf_migration_file *migf)
{
mutex_lock(&migf->lock);
migf->state = VIRTIOVF_MIGF_STATE_ERROR;
migf->filp->f_pos = 0;
mutex_unlock(&migf->lock);
}
static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev)
{
if (virtvdev->resuming_migf) {
virtiovf_disable_fd(virtvdev->resuming_migf);
virtiovf_clean_migf_resources(virtvdev->resuming_migf);
fput(virtvdev->resuming_migf->filp);
virtvdev->resuming_migf = NULL;
}
if (virtvdev->saving_migf) {
virtiovf_disable_fd(virtvdev->saving_migf);
virtiovf_clean_migf_resources(virtvdev->saving_migf);
fput(virtvdev->saving_migf->filp);
virtvdev->saving_migf = NULL;
}
}
/*
* This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset', if one exists.
*/
static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvdev)
{
again:
spin_lock(&virtvdev->reset_lock);
if (virtvdev->deferred_reset) {
virtvdev->deferred_reset = false;
spin_unlock(&virtvdev->reset_lock);
virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
virtiovf_disable_fds(virtvdev);
goto again;
}
mutex_unlock(&virtvdev->state_mutex);
spin_unlock(&virtvdev->reset_lock);
}
void virtiovf_migration_reset_done(struct pci_dev *pdev)
{
struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
if (!virtvdev->migrate_cap)
return;
/*
* As the higher VFIO layers are holding locks across reset and using
* those same locks with the mm_lock we need to prevent ABBA deadlock
* with the state_mutex and mm_lock.
* In case the state_mutex was taken already we defer the cleanup work
* to the unlock flow of the other running context.
*/
spin_lock(&virtvdev->reset_lock);
virtvdev->deferred_reset = true;
if (!mutex_trylock(&virtvdev->state_mutex)) {
spin_unlock(&virtvdev->reset_lock);
return;
}
spin_unlock(&virtvdev->reset_lock);
virtiovf_state_mutex_unlock(virtvdev);
}
static int virtiovf_release_file(struct inode *inode, struct file *filp)
{
struct virtiovf_migration_file *migf = filp->private_data;
virtiovf_disable_fd(migf);
mutex_destroy(&migf->lock);
kfree(migf);
return 0;
}
static struct virtiovf_data_buffer *
virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf,
loff_t pos, bool *end_of_data)
{
struct virtiovf_data_buffer *buf;
bool found = false;
*end_of_data = false;
spin_lock_irq(&migf->list_lock);
if (list_empty(&migf->buf_list)) {
*end_of_data = true;
goto end;
}
buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer,
buf_elm);
if (pos >= buf->start_pos &&
pos < buf->start_pos + buf->length) {
found = true;
goto end;
}
	/*
	 * As we use a stream-based FD, the data is always expected
	 * to be in the first chunk
	 */
migf->state = VIRTIOVF_MIGF_STATE_ERROR;
end:
spin_unlock_irq(&migf->list_lock);
return found ? buf : NULL;
}
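/*
 * Copy data from @vhca_buf to user space, page by page, starting at *pos.
 * A fully consumed buffer is moved back to the avail_list for reuse.
 */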
static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf,
char __user **buf, size_t *len, loff_t *pos)
{
unsigned long offset;
ssize_t done = 0;
size_t copy_len;
copy_len = min_t(size_t,
vhca_buf->start_pos + vhca_buf->length - *pos, *len);
while (copy_len) {
size_t page_offset;
struct page *page;
size_t page_len;
u8 *from_buff;
int ret;
offset = *pos - vhca_buf->start_pos;
page_offset = offset % PAGE_SIZE;
offset -= page_offset;
page = virtiovf_get_migration_page(vhca_buf, offset);
if (!page)
return -EINVAL;
page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
from_buff = kmap_local_page(page);
ret = copy_to_user(*buf, from_buff + page_offset, page_len);
kunmap_local(from_buff);
if (ret)
return -EFAULT;
*pos += page_len;
*len -= page_len;
*buf += page_len;
done += page_len;
copy_len -= page_len;
}
if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
spin_lock_irq(&vhca_buf->migf->list_lock);
list_del_init(&vhca_buf->buf_elm);
list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
spin_unlock_irq(&vhca_buf->migf->list_lock);
}
return done;
}
static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t len,
loff_t *pos)
{
struct virtiovf_migration_file *migf = filp->private_data;
struct virtiovf_data_buffer *vhca_buf;
bool first_loop_call = true;
bool end_of_data;
ssize_t done = 0;
if (pos)
return -ESPIPE;
pos = &filp->f_pos;
mutex_lock(&migf->lock);
if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
done = -ENODEV;
goto out_unlock;
}
while (len) {
ssize_t count;
vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
if (first_loop_call) {
first_loop_call = false;
/* Temporary end of file as part of PRE_COPY */
if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
done = -ENOMSG;
goto out_unlock;
}
if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
done = -EINVAL;
goto out_unlock;
}
}
if (end_of_data)
goto out_unlock;
if (!vhca_buf) {
done = -EINVAL;
goto out_unlock;
}
count = virtiovf_buf_read(vhca_buf, &buf, &len, pos);
if (count < 0) {
done = count;
goto out_unlock;
}
done += count;
}
out_unlock:
mutex_unlock(&migf->lock);
return done;
}
static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct virtiovf_migration_file *migf = filp->private_data;
struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
struct vfio_precopy_info info = {};
loff_t *pos = &filp->f_pos;
bool end_of_data = false;
unsigned long minsz;
u32 ctx_size = 0;
int ret;
if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
return -ENOTTY;
minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
mutex_lock(&virtvdev->state_mutex);
if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
ret = -EINVAL;
goto err_state_unlock;
}
/*
* The virtio specification does not include a PRE_COPY concept.
* Since we can expect the data to remain the same for a certain period,
* we use a rate limiter mechanism before making a call to the device.
*/
if (__ratelimit(&migf->pre_copy_rl_state)) {
ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
&ctx_size);
if (ret)
goto err_state_unlock;
}
mutex_lock(&migf->lock);
if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
ret = -ENODEV;
goto err_migf_unlock;
}
if (migf->pre_copy_initial_bytes > *pos) {
info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
} else {
info.dirty_bytes = migf->max_pos - *pos;
if (!info.dirty_bytes)
end_of_data = true;
info.dirty_bytes += ctx_size;
}
if (!end_of_data || !ctx_size) {
mutex_unlock(&migf->lock);
goto done;
}
mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state; read a new state.
	 */
ret = virtiovf_read_device_context_chunk(migf, ctx_size);
if (ret)
		/*
		 * The machine is running and the context size could grow, so there is
		 * no reason to mark the device state as VIRTIOVF_MIGF_STATE_ERROR.
		 */
goto err_state_unlock;
done:
virtiovf_state_mutex_unlock(virtvdev);
if (copy_to_user((void __user *)arg, &info, minsz))
return -EFAULT;
return 0;
err_migf_unlock:
mutex_unlock(&migf->lock);
err_state_unlock:
virtiovf_state_mutex_unlock(virtvdev);
return ret;
}
static const struct file_operations virtiovf_save_fops = {
.owner = THIS_MODULE,
.read = virtiovf_save_read,
.unlocked_ioctl = virtiovf_precopy_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.release = virtiovf_release_file,
};
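/*
 * Fill @header_buf with a virtiovf_migration_header record (record size,
 * mandatory-tag flag and device-data tag) and queue it on the migration
 * file's buf_list just ahead of the device data it describes.
 */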
static int
virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf,
u32 data_size)
{
struct virtiovf_migration_file *migf = header_buf->migf;
struct virtiovf_migration_header header = {};
struct page *page;
u8 *to_buff;
header.record_size = cpu_to_le64(data_size);
header.flags = cpu_to_le32(VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY);
header.tag = cpu_to_le32(VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA);
page = virtiovf_get_migration_page(header_buf, 0);
if (!page)
return -EINVAL;
to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header));
kunmap_local(to_buff);
header_buf->length = sizeof(header);
header_buf->start_pos = header_buf->migf->max_pos;
migf->max_pos += header_buf->length;
spin_lock_irq(&migf->list_lock);
list_add_tail(&header_buf->buf_elm, &migf->buf_list);
spin_unlock_irq(&migf->list_lock);
return 0;
}
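/*
 * Read a device context of @ctx_size bytes into a data buffer and queue it,
 * preceded by a header record, on the migration file's buf_list. The SG
 * table end mark is temporarily moved so that the lower layers iterate only
 * over the entries needed for @ctx_size.
 */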
static int
virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
u32 ctx_size)
{
struct virtiovf_data_buffer *header_buf;
struct virtiovf_data_buffer *buf;
bool unmark_end = false;
struct scatterlist *sg;
unsigned int i;
u32 res_size;
int nent;
int ret;
buf = virtiovf_get_data_buffer(migf, ctx_size);
if (IS_ERR(buf))
return PTR_ERR(buf);
	/* Find the number of SG entries needed to satisfy the requested size */
nent = sg_nents_for_len(buf->table.sgt.sgl, ctx_size);
if (nent <= 0) {
ret = -EINVAL;
goto out;
}
	/*
	 * Iterate to that SG entry and mark it as last (if it's not already)
	 * to let the underlying layers iterate only up to that entry.
	 */
for_each_sg(buf->table.sgt.sgl, sg, nent - 1, i)
;
if (!sg_is_last(sg)) {
unmark_end = true;
sg_mark_end(sg);
}
ret = virtio_pci_admin_dev_parts_get(migf->virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS,
migf->obj_id,
VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL,
buf->table.sgt.sgl, &res_size);
/* Restore the original SG mark end */
if (unmark_end)
sg_unmark_end(sg);
if (ret)
goto out;
buf->length = res_size;
header_buf = virtiovf_get_data_buffer(migf,
sizeof(struct virtiovf_migration_header));
if (IS_ERR(header_buf)) {
ret = PTR_ERR(header_buf);
goto out;
}
ret = virtiovf_add_buf_header(header_buf, res_size);
if (ret)
goto out_header;
buf->start_pos = buf->migf->max_pos;
migf->max_pos += buf->length;
	spin_lock_irq(&migf->list_lock);
list_add_tail(&buf->buf_elm, &migf->buf_list);
spin_unlock_irq(&migf->list_lock);
return 0;
out_header:
virtiovf_put_data_buffer(header_buf);
out:
virtiovf_put_data_buffer(buf);
return ret;
}
static int
virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
{
struct virtiovf_migration_file *migf = virtvdev->saving_migf;
u32 ctx_size;
int ret;
if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
return -ENODEV;
ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
&ctx_size);
if (ret)
goto err;
if (!ctx_size) {
ret = -EINVAL;
goto err;
}
ret = virtiovf_read_device_context_chunk(migf, ctx_size);
if (ret)
goto err;
migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
return 0;
err:
migf->state = VIRTIOVF_MIGF_STATE_ERROR;
return ret;
}
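/*
 * Create the saving migration file, allocate the device parts GET object
 * and read an initial full device context. For PRE_COPY the rate limiter
 * is armed and the file is left in PRECOPY state; otherwise the initial
 * read already completes the device state.
 */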
static struct virtiovf_migration_file *
virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
bool pre_copy)
{
struct virtiovf_migration_file *migf;
u32 ctx_size;
u32 obj_id;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
if (!migf)
return ERR_PTR(-ENOMEM);
migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_save_fops, migf,
O_RDONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
kfree(migf);
return ERR_PTR(ret);
}
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
migf->virtvdev = virtvdev;
lockdep_assert_held(&virtvdev->state_mutex);
ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
&obj_id);
if (ret)
goto out;
migf->obj_id = obj_id;
	/* Mark as having a valid obj_id, which may even be 0 */
migf->has_obj_id = true;
ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
&ctx_size);
if (ret)
goto out_clean;
if (!ctx_size) {
ret = -EINVAL;
goto out_clean;
}
ret = virtiovf_read_device_context_chunk(migf, ctx_size);
if (ret)
goto out_clean;
if (pre_copy) {
migf->pre_copy_initial_bytes = migf->max_pos;
/* Arbitrarily set the pre-copy rate limit to 1-second intervals */
ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);
	/* Prevent any rate-limit messages from being printed upon usage */
ratelimit_set_flags(&migf->pre_copy_rl_state,
RATELIMIT_MSG_ON_RELEASE);
migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
} else {
migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
}
return migf;
out_clean:
virtiovf_clean_migf_resources(migf);
out:
fput(migf->filp);
return ERR_PTR(ret);
}
/*
* Set the required object header at the beginning of the buffer.
 * The actual device parts data will be written after the header.
*/
static int virtiovf_set_obj_cmd_header(struct virtiovf_data_buffer *vhca_buf)
{
struct virtio_admin_cmd_resource_obj_cmd_hdr obj_hdr = {};
struct page *page;
u8 *to_buff;
obj_hdr.type = cpu_to_le16(VIRTIO_RESOURCE_OBJ_DEV_PARTS);
obj_hdr.id = cpu_to_le32(vhca_buf->migf->obj_id);
page = virtiovf_get_migration_page(vhca_buf, 0);
if (!page)
return -EINVAL;
to_buff = kmap_local_page(page);
memcpy(to_buff, &obj_hdr, sizeof(obj_hdr));
kunmap_local(to_buff);
/* Mark the buffer as including the header object data */
vhca_buf->include_header_object = 1;
return 0;
}
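/*
 * Copy user data into the page of @vhca_buf that corresponds to *pos,
 * advancing the stream position and byte counters. For buffers carrying
 * the resource object command header, the offset is shifted past that
 * header.
 */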
static int
virtiovf_append_page_to_mig_buf(struct virtiovf_data_buffer *vhca_buf,
const char __user **buf, size_t *len,
loff_t *pos, ssize_t *done)
{
unsigned long offset;
size_t page_offset;
struct page *page;
size_t page_len;
u8 *to_buff;
int ret;
offset = *pos - vhca_buf->start_pos;
if (vhca_buf->include_header_object)
		/* The buffer holds the object header; update the offset accordingly */
offset += sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
page_offset = offset % PAGE_SIZE;
page = virtiovf_get_migration_page(vhca_buf, offset - page_offset);
if (!page)
return -EINVAL;
page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
to_buff = kmap_local_page(page);
ret = copy_from_user(to_buff + page_offset, *buf, page_len);
kunmap_local(to_buff);
if (ret)
return -EFAULT;
*pos += page_len;
*done += page_len;
*buf += page_len;
*len -= page_len;
vhca_buf->length += page_len;
return 0;
}
static ssize_t
virtiovf_resume_read_chunk(struct virtiovf_migration_file *migf,
struct virtiovf_data_buffer *vhca_buf,
size_t chunk_size, const char __user **buf,
size_t *len, loff_t *pos, ssize_t *done,
bool *has_work)
{
size_t copy_len, to_copy;
int ret;
to_copy = min_t(size_t, *len, chunk_size - vhca_buf->length);
copy_len = to_copy;
while (to_copy) {
ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
pos, done);
if (ret)
return ret;
}
*len -= copy_len;
if (vhca_buf->length == chunk_size) {
migf->load_state = VIRTIOVF_LOAD_STATE_LOAD_CHUNK;
migf->max_pos += chunk_size;
*has_work = true;
}
return 0;
}
static int
virtiovf_resume_read_header_data(struct virtiovf_migration_file *migf,
struct virtiovf_data_buffer *vhca_buf,
const char __user **buf, size_t *len,
loff_t *pos, ssize_t *done)
{
size_t copy_len, to_copy;
size_t required_data;
int ret;
required_data = migf->record_size - vhca_buf->length;
to_copy = min_t(size_t, *len, required_data);
copy_len = to_copy;
while (to_copy) {
ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
pos, done);
if (ret)
return ret;
}
*len -= copy_len;
if (vhca_buf->length == migf->record_size) {
switch (migf->record_tag) {
default:
/* Optional tag */
break;
}
migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
migf->max_pos += migf->record_size;
vhca_buf->length = 0;
}
return 0;
}
static int
virtiovf_resume_read_header(struct virtiovf_migration_file *migf,
struct virtiovf_data_buffer *vhca_buf,
const char __user **buf,
size_t *len, loff_t *pos,
ssize_t *done, bool *has_work)
{
struct page *page;
size_t copy_len;
u8 *to_buff;
int ret;
copy_len = min_t(size_t, *len,
sizeof(struct virtiovf_migration_header) - vhca_buf->length);
page = virtiovf_get_migration_page(vhca_buf, 0);
if (!page)
return -EINVAL;
to_buff = kmap_local_page(page);
ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
if (ret) {
ret = -EFAULT;
goto end;
}
*buf += copy_len;
*pos += copy_len;
*done += copy_len;
*len -= copy_len;
vhca_buf->length += copy_len;
if (vhca_buf->length == sizeof(struct virtiovf_migration_header)) {
u64 record_size;
u32 flags;
record_size = le64_to_cpup((__le64 *)to_buff);
if (record_size > MAX_LOAD_SIZE) {
ret = -ENOMEM;
goto end;
}
migf->record_size = record_size;
flags = le32_to_cpup((__le32 *)(to_buff +
offsetof(struct virtiovf_migration_header, flags)));
migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
offsetof(struct virtiovf_migration_header, tag)));
switch (migf->record_tag) {
case VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA:
migf->load_state = VIRTIOVF_LOAD_STATE_PREP_CHUNK;
break;
default:
if (!(flags & VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
ret = -EOPNOTSUPP;
goto end;
}
/* We may read and skip this optional record data */
migf->load_state = VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA;
}
migf->max_pos += vhca_buf->length;
vhca_buf->length = 0;
*has_work = true;
}
end:
kunmap_local(to_buff);
return ret;
}
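/*
 * Resume (load) flow, driven as a state machine by the incoming stream:
 * READ_HEADER parses a virtiovf_migration_header, the PREP/READ states
 * stage the record payload into a buffer, and LOAD_CHUNK pushes a complete
 * device context chunk to the device via the admin commands.
 */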
static ssize_t virtiovf_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct virtiovf_migration_file *migf = filp->private_data;
struct virtiovf_data_buffer *vhca_buf = migf->buf;
struct virtiovf_data_buffer *vhca_buf_header = migf->buf_header;
unsigned int orig_length;
bool has_work = false;
ssize_t done = 0;
int ret = 0;
if (pos)
return -ESPIPE;
pos = &filp->f_pos;
if (*pos < vhca_buf->start_pos)
return -EINVAL;
mutex_lock(&migf->virtvdev->state_mutex);
mutex_lock(&migf->lock);
if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
done = -ENODEV;
goto out_unlock;
}
while (len || has_work) {
has_work = false;
switch (migf->load_state) {
case VIRTIOVF_LOAD_STATE_READ_HEADER:
ret = virtiovf_resume_read_header(migf, vhca_buf_header, &buf,
&len, pos, &done, &has_work);
if (ret)
goto out_unlock;
break;
case VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA:
if (vhca_buf_header->allocated_length < migf->record_size) {
virtiovf_free_data_buffer(vhca_buf_header);
migf->buf_header = virtiovf_alloc_data_buffer(migf,
migf->record_size);
if (IS_ERR(migf->buf_header)) {
ret = PTR_ERR(migf->buf_header);
migf->buf_header = NULL;
goto out_unlock;
}
vhca_buf_header = migf->buf_header;
}
vhca_buf_header->start_pos = migf->max_pos;
migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER_DATA;
break;
case VIRTIOVF_LOAD_STATE_READ_HEADER_DATA:
ret = virtiovf_resume_read_header_data(migf, vhca_buf_header,
&buf, &len, pos, &done);
if (ret)
goto out_unlock;
break;
case VIRTIOVF_LOAD_STATE_PREP_CHUNK:
{
u32 cmd_size = migf->record_size +
sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
			/*
			 * The DMA map/unmap is managed in the virtio layer; we just need to
			 * extend the SG pages to hold the extra required chunk data.
			 */
if (vhca_buf->allocated_length < cmd_size) {
ret = virtiovf_add_migration_pages(vhca_buf,
DIV_ROUND_UP_ULL(cmd_size - vhca_buf->allocated_length,
PAGE_SIZE));
if (ret)
goto out_unlock;
}
vhca_buf->start_pos = migf->max_pos;
migf->load_state = VIRTIOVF_LOAD_STATE_READ_CHUNK;
break;
}
case VIRTIOVF_LOAD_STATE_READ_CHUNK:
ret = virtiovf_resume_read_chunk(migf, vhca_buf, migf->record_size,
&buf, &len, pos, &done, &has_work);
if (ret)
goto out_unlock;
break;
case VIRTIOVF_LOAD_STATE_LOAD_CHUNK:
/* Mark the last SG entry and set its length */
sg_mark_end(vhca_buf->last_offset_sg);
orig_length = vhca_buf->last_offset_sg->length;
/* Length should include the resource object command header */
vhca_buf->last_offset_sg->length = vhca_buf->length +
sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr) -
vhca_buf->last_offset;
ret = virtio_pci_admin_dev_parts_set(migf->virtvdev->core_device.pdev,
vhca_buf->table.sgt.sgl);
/* Restore the original SG data */
vhca_buf->last_offset_sg->length = orig_length;
sg_unmark_end(vhca_buf->last_offset_sg);
if (ret)
goto out_unlock;
migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
/* be ready for reading the next chunk */
vhca_buf->length = 0;
break;
default:
break;
}
}
out_unlock:
if (ret)
migf->state = VIRTIOVF_MIGF_STATE_ERROR;
mutex_unlock(&migf->lock);
virtiovf_state_mutex_unlock(migf->virtvdev);
return ret ? ret : done;
}
static const struct file_operations virtiovf_resume_fops = {
.owner = THIS_MODULE,
.write = virtiovf_resume_write,
.release = virtiovf_release_file,
};
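/*
 * Create the resuming migration file: an initial data buffer headed by the
 * resource object command header, a buffer for parsing record headers, and
 * the device parts SET object that is used when loading chunks.
 */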
static struct virtiovf_migration_file *
virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev)
{
struct virtiovf_migration_file *migf;
struct virtiovf_data_buffer *buf;
u32 obj_id;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
if (!migf)
return ERR_PTR(-ENOMEM);
migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_resume_fops, migf,
O_WRONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
kfree(migf);
return ERR_PTR(ret);
}
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out;
}
migf->buf = buf;
buf = virtiovf_alloc_data_buffer(migf,
sizeof(struct virtiovf_migration_header));
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_clean;
}
migf->buf_header = buf;
migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
migf->virtvdev = virtvdev;
ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET,
&obj_id);
if (ret)
goto out_clean;
migf->obj_id = obj_id;
	/* Mark as having a valid obj_id, which may even be 0 */
migf->has_obj_id = true;
ret = virtiovf_set_obj_cmd_header(migf->buf);
if (ret)
goto out_clean;
return migf;
out_clean:
virtiovf_clean_migf_resources(migf);
out:
fput(migf->filp);
return ERR_PTR(ret);
}
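/*
 * Execute a single arc of the migration state machine, as computed by
 * vfio_mig_get_next_state(). Arcs that create a migration file return its
 * struct file; NOP arcs and device mode changes return NULL on success.
 */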
static struct file *
virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
u32 new)
{
u32 cur = virtvdev->mig_state;
int ret;
if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
/* NOP */
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
/* NOP */
return NULL;
}
if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
if (ret)
return ERR_PTR(ret);
return NULL;
}
if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
if (ret)
return ERR_PTR(ret);
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
struct virtiovf_migration_file *migf;
migf = virtiovf_pci_save_device_data(virtvdev, false);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
virtvdev->saving_migf = migf;
return migf->filp;
}
if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
virtiovf_disable_fds(virtvdev);
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
struct virtiovf_migration_file *migf;
migf = virtiovf_pci_resume_device_data(virtvdev);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
virtvdev->resuming_migf = migf;
return migf->filp;
}
if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
virtiovf_disable_fds(virtvdev);
return NULL;
}
if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
(cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
struct virtiovf_migration_file *migf;
migf = virtiovf_pci_save_device_data(virtvdev, true);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
virtvdev->saving_migf = migf;
return migf->filp;
}
if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
ret = virtiovf_pci_save_device_final_data(virtvdev);
return ret ? ERR_PTR(ret) : NULL;
}
/*
* vfio_mig_get_next_state() does not use arcs other than the above
*/
WARN_ON(true);
return ERR_PTR(-EINVAL);
}
static struct file *
virtiovf_pci_set_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state new_state)
{
struct virtiovf_pci_core_device *virtvdev = container_of(
vdev, struct virtiovf_pci_core_device, core_device.vdev);
enum vfio_device_mig_state next_state;
struct file *res = NULL;
int ret;
mutex_lock(&virtvdev->state_mutex);
while (new_state != virtvdev->mig_state) {
ret = vfio_mig_get_next_state(vdev, virtvdev->mig_state,
new_state, &next_state);
if (ret) {
res = ERR_PTR(ret);
break;
}
res = virtiovf_pci_step_device_state_locked(virtvdev, next_state);
if (IS_ERR(res))
break;
virtvdev->mig_state = next_state;
if (WARN_ON(res && new_state != virtvdev->mig_state)) {
fput(res);
res = ERR_PTR(-EINVAL);
break;
}
}
virtiovf_state_mutex_unlock(virtvdev);
return res;
}
static int virtiovf_pci_get_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state *curr_state)
{
struct virtiovf_pci_core_device *virtvdev = container_of(
vdev, struct virtiovf_pci_core_device, core_device.vdev);
mutex_lock(&virtvdev->state_mutex);
*curr_state = virtvdev->mig_state;
virtiovf_state_mutex_unlock(virtvdev);
return 0;
}
static int virtiovf_pci_get_data_size(struct vfio_device *vdev,
unsigned long *stop_copy_length)
{
struct virtiovf_pci_core_device *virtvdev = container_of(
vdev, struct virtiovf_pci_core_device, core_device.vdev);
bool obj_id_exists;
u32 res_size;
u32 obj_id;
int ret;
mutex_lock(&virtvdev->state_mutex);
obj_id_exists = virtvdev->saving_migf && virtvdev->saving_migf->has_obj_id;
if (!obj_id_exists) {
ret = virtiovf_pci_alloc_obj_id(virtvdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
&obj_id);
if (ret)
goto end;
} else {
obj_id = virtvdev->saving_migf->obj_id;
}
ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
&res_size);
if (!ret)
*stop_copy_length = res_size;
	/*
	 * We can't leave this obj_id alive if it didn't exist before; otherwise
	 * it might stay alive even without an active migration flow (e.g. if the
	 * migration was cancelled).
	 */
if (!obj_id_exists)
virtiovf_pci_free_obj_id(virtvdev, obj_id);
end:
virtiovf_state_mutex_unlock(virtvdev);
return ret;
}
static const struct vfio_migration_ops virtvdev_pci_mig_ops = {
.migration_set_state = virtiovf_pci_set_device_state,
.migration_get_state = virtiovf_pci_get_device_state,
.migration_get_data_size = virtiovf_pci_get_data_size,
};
void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
{
virtvdev->migrate_cap = 1;
mutex_init(&virtvdev->state_mutex);
spin_lock_init(&virtvdev->reset_lock);
virtvdev->core_device.vdev.migration_flags =
VFIO_MIGRATION_STOP_COPY |
VFIO_MIGRATION_P2P |
VFIO_MIGRATION_PRE_COPY;
virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
}
void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev)
{
if (!virtvdev->migrate_cap)
return;
virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
}
void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev)
{
if (!virtvdev->migrate_cap)
return;
virtiovf_disable_fds(virtvdev);
}