linux/drivers/vfio/pci/mlx5/main.c

1458 lines
37 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
*/
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>
#include "cmd.h"
/*
 * Device specification max LOAD size.
 * NOTE(review): presumably __mlx5_bit_sz() yields the bit width of the 'size'
 * field of load_vhca_state_in, making this the largest value that field can
 * carry - confirm against the mlx5 interface headers.
 */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
/*
 * History note (commit message captured alongside the following line):
 *
 * vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase
 *
 * This patch is another preparation step towards working in chunk mode.
 * It pre-allocates chunks for the STOP_COPY phase to let the driver use them
 * immediately and prevent an extra allocation upon that phase.
 *
 * Before that patch we had a single large buffer that was dedicated for the
 * STOP_COPY phase as there was a single SAVE in the source for the last
 * image. Once we'll move to chunk mode the idea is to have some small buffers
 * that will be used upon the STOP_COPY phase. The driver will read-ahead from
 * the firmware the full state in small/optimized chunks while letting
 * QEMU/user space read in parallel the available data.
 *
 * Each buffer holds its chunk number to let it be recognized down the road in
 * the coming patches. The chunk buffer size is picked-up based on the minimum
 * size that firmware requires, the total full size and some max value in the
 * driver code which was set to 8MB to achieve some optimized downtime in the
 * general case.
 *
 * As the chunk mode is applicable even if we move directly to STOP_COPY the
 * buffers preparation and some other related stuff is done unconditionally
 * with regards to STOP/PRE-COPY.
 *
 * Note: In that phase in the series we still didn't activate the chunk mode
 * and the first buffer will be used in all the places.
 *
 * Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
 * Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
 * Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com
 * Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 *
 * 2023-09-11 12:38:53 +03:00
 */
/* Upper bound applied to the per-chunk buffer size in mlx5vf_prep_stop_copy() */
#define MAX_CHUNK_SIZE SZ_8M
/*
 * Resolve the mlx5 vfio device that embeds the vfio_pci_core_device stored as
 * the driver data of @pdev.
 */
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *vdev;

	vdev = dev_get_drvdata(&pdev->dev);

	return container_of(vdev, struct mlx5vf_pci_core_device, core_device);
}
/*
 * Return the page backing byte @offset of @buf, or NULL when @offset is not
 * covered by the scatterlist entries that get walked.
 *
 * The entry found by the previous lookup is cached in @buf (last_offset,
 * last_offset_sg, sg_last_entry) so that a lookup at the same or a higher
 * offset resumes from there. The first lookup on a buffer, or a lookup at a
 * lower offset than the cached one, restarts from the head of the table.
 */
struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	struct scatterlist *entry;
	unsigned long entry_start;
	unsigned int idx;

	/* All accesses are sequential; anything else rewinds the cursor */
	if (!buf->last_offset_sg || offset < buf->last_offset) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	entry_start = buf->last_offset;
	for_each_sg(buf->last_offset_sg, entry,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, idx) {
		/* Entry ends at or before @offset: skip over it */
		if (offset >= entry_start + entry->length) {
			entry_start += entry->length;
			continue;
		}

		/* Covering entry found: remember it for the next lookup */
		buf->last_offset_sg = entry;
		buf->sg_last_entry += idx;
		buf->last_offset = entry_start;
		return nth_page(sg_page(entry),
				(offset - entry_start) / PAGE_SIZE);
	}

	return NULL;
}
/*
 * Move the migration file to the error state and rewind its file position,
 * both while holding the file's mutex.
 */
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	struct mutex *lock = &migf->lock;

	mutex_lock(lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(lock);
}
/*
 * release() handler of a migration file: disable the file, then destroy its
 * mutex and free the migration file object. Always returns 0.
 */
static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *mig_file = filp->private_data;

	mlx5vf_disable_fd(mig_file);
	mutex_destroy(&mig_file->lock);
	kfree(mig_file);

	return 0;
}
/*
 * Find the queued buffer that holds stream position @pos.
 *
 * Only the first entry of migf->buf_list is examined. *@end_of_data is set to
 * true when the list is empty and to false otherwise. When the list is not
 * empty but its first buffer does not cover @pos, the file is moved to the
 * error state and NULL is returned.
 */
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *match = NULL;
	struct mlx5_vhca_data_buffer *head;

	*end_of_data = false;

	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
	} else {
		head = list_first_entry(&migf->buf_list,
					struct mlx5_vhca_data_buffer, buf_elm);
		if (pos >= head->start_pos &&
		    pos < head->start_pos + head->length) {
			match = head;
		} else {
			/*
			 * As we use a stream based FD we may expect having
			 * the data always on first chunk
			 */
			migf->state = MLX5_MIGF_STATE_ERROR;
		}
	}
	spin_unlock_irq(&migf->list_lock);

	return match;
}
/*
 * Retire a buffer whose data has been fully read.
 *
 * A buffer without a stop_copy chunk number is unlinked and appended to the
 * file's avail_list. A buffer with a chunk number is stored back into its
 * migf->buf[] / migf->buf_header[] slot (DMA_NONE buffers are headers) and
 * unlinked; for a data buffer the ready-chunk count is decremented and, if a
 * next required umem size was recorded, it is consumed and a save work is
 * scheduled for that chunk.
 */
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;
	u8 chunk_num = vhca_buf->stop_copy_chunk_num;
	size_t pending_umem_size = 0;
	bool is_header;

	if (!chunk_num) {
		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &migf->avail_list);
		spin_unlock_irq(&migf->list_lock);
		return;
	}

	is_header = vhca_buf->dma_dir == DMA_NONE;
	if (is_header)
		migf->buf_header[chunk_num - 1] = vhca_buf;
	else
		migf->buf[chunk_num - 1] = vhca_buf;

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	if (!is_header) {
		/* Take over the recorded size so it is acted upon only once */
		pending_umem_size = migf->next_required_umem_size;
		migf->next_required_umem_size = 0;
		migf->num_ready_chunks--;
	}
	spin_unlock_irq(&migf->list_lock);

	if (pending_umem_size)
		mlx5vf_mig_file_set_save_work(migf, chunk_num,
					      pending_umem_size);
}
/*
 * Copy data of @vhca_buf to user space starting at stream position *@pos,
 * bounded by *@len and by the end of the buffer's data, one page (or partial
 * page) at a time.
 *
 * *@buf, *@len and *@pos are advanced after every copied piece. When *@pos
 * reaches the end of the buffer's data the buffer is passed to
 * mlx5vf_buf_read_done().
 *
 * Returns the number of bytes copied, -EINVAL when no backing page is found,
 * or -EFAULT when the copy to user space fails. On error the in/out arguments
 * still reflect the pieces copied before the failure.
 */
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	ssize_t copied = 0;
	size_t remaining;

	remaining = min_t(size_t,
			  vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (remaining) {
		unsigned long buf_offset = *pos - vhca_buf->start_pos;
		size_t in_page = buf_offset % PAGE_SIZE;
		struct page *page;
		size_t piece;
		u8 *vaddr;
		int ret;

		/* Look the page up by its page-aligned offset in the buffer */
		page = mlx5vf_get_migration_page(vhca_buf,
						 buf_offset - in_page);
		if (!page)
			return -EINVAL;

		piece = min_t(size_t, remaining, PAGE_SIZE - in_page);
		vaddr = kmap_local_page(page);
		ret = copy_to_user(*buf, vaddr + in_page, piece);
		kunmap_local(vaddr);
		if (ret)
			return -EFAULT;

		*pos += piece;
		*len -= piece;
		*buf += piece;
		copied += piece;
		remaining -= piece;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return copied;
}
/*
 * read() handler of the SAVE migration file (stream semantics only).
 *
 * A non-NULL @pos is rejected with -ESPIPE; the file's own f_pos is used
 * instead. Unless O_NONBLOCK is set, the call first sleeps until a buffer is
 * queued or the file is in one of the states tested in the wait condition.
 *
 * Returns the number of bytes copied, or:
 *   -ERESTARTSYS  the sleep was interrupted
 *   -ENODEV       the file is in the error state
 *   -ENOMSG       nothing queued on the first lookup while the file is in
 *                 PRE_COPY or PRE_COPY_ERROR
 *   -EAGAIN       nothing queued on the first lookup, the file is not
 *                 COMPLETE and O_NONBLOCK is set
 *   -EINVAL       a buffer is queued but does not cover the current position
 *   or the negative value returned by mlx5vf_buf_read().
 */
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *cur_buf;
	bool first_lookup = true;
	bool no_more_data;
	ssize_t total = 0;
	ssize_t nread;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		total = -ENODEV;
		goto unlock;
	}

	while (len) {
		cur_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							&no_more_data);
		if (no_more_data) {
			/* Later lookups just end the read with what we have */
			if (!first_lookup)
				break;

			/* Temporary end of file as part of PRE_COPY */
			if (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
			    migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
				total = -ENOMSG;
			else if (migf->state != MLX5_MIGF_STATE_COMPLETE &&
				 (filp->f_flags & O_NONBLOCK))
				total = -EAGAIN;
			break;
		}
		first_lookup = false;

		if (!cur_buf) {
			total = -EINVAL;
			break;
		}

		nread = mlx5vf_buf_read(cur_buf, &buf, &len, pos);
		if (nread < 0) {
			total = nread;
			break;
		}
		total += nread;
	}

unlock:
	mutex_unlock(&migf->lock);
	return total;
}
static __poll_t mlx5vf_save_poll(struct file *filp,
struct poll_table_struct *wait)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
__poll_t pollflags = 0;
poll_wait(filp, &migf->poll_wait, wait);
mutex_lock(&migf->lock);
if (migf->state == MLX5_MIGF_STATE_ERROR)
pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
else if (!list_empty(&migf->buf_list) ||
migf->state == MLX5_MIGF_STATE_COMPLETE)
pollflags = EPOLLIN | EPOLLRDNORM;
mutex_unlock(&migf->lock);
return pollflags;
}
/*
 * The FD stays visible to user space even after a failure, so flag the
 * migration file as being in error and wake up anyone sleeping on its
 * poll_wait queue.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;

	wake_up_interruptible(&migf->poll_wait);
}
/*
 * Schedule the save work of stop_copy chunk @chunk_num (1-based).
 *
 * Records @next_required_umem_size and @migf in the chunk's save_data slot,
 * takes a reference on the migration file and queues the slot's work item on
 * the device's cb_wq.
 */
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	struct mlx5vf_save_work_data *slot = &migf->save_data[chunk_num - 1];

	slot->next_required_umem_size = next_required_umem_size;
	slot->migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq, &slot->work);
}
/*
 * Take the pre-allocated stop_copy data buffer stored in migf->buf[@index].
 *
 * The slot is cleared. If the stored buffer's allocated_length is at least
 * @required_length it is returned as is; otherwise it is handed back with
 * mlx5vf_put_data_buffer() and a buffer of @required_length is obtained
 * instead, inheriting the original stop_copy chunk number.
 *
 * Returns the buffer, ERR_PTR(-EINVAL) when the slot is unexpectedly empty
 * (after a WARN), or the ERR_PTR() reported by mlx5vf_get_data_buffer().
 */
static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	/* Warn and fail instead of dereferencing a missing buffer */
	if (WARN_ON(!buf))
		return ERR_PTR(-EINVAL);

	chunk_num = buf->stop_copy_chunk_num;
	migf->buf[index] = NULL;

	/* Checking whether the pre-allocated buffer can fit */
	if (buf->allocated_length >= required_length)
		return buf;

	/*
	 * Too small: hand it back and get a bigger one. Use @migf rather than
	 * buf->migf so the buffer is not touched after it was put.
	 */
	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(migf, required_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}
/*
 * Work handler that saves the next stop_copy chunk.
 *
 * Under the device state mutex, and unless the file is already in the error
 * state, it takes the chunk's stop_copy buffer and issues a save command on
 * it. A failure to get the buffer or to issue the command marks the file in
 * error; in the latter case the buffer is put back first. The file reference
 * taken when the work was scheduled is dropped at the end.
 */
static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *work_data =
		container_of(_work, struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = work_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *chunk_buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state != MLX5_MIGF_STATE_ERROR) {
		chunk_buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
					work_data->chunk_num - 1,
					work_data->next_required_umem_size);
		if (IS_ERR(chunk_buf)) {
			mlx5vf_mark_err(migf);
		} else if (mlx5vf_cmd_save_vhca_state(mvdev, migf, chunk_buf,
						      true, false)) {
			mlx5vf_put_data_buffer(chunk_buf);
			mlx5vf_mark_err(migf);
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}
/*
 * History note (commit message captured alongside the following lines):
 *
 * vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase
 *
 * This patch is another preparation step towards working in chunk mode.
 * It pre-allocates chunks for the STOP_COPY phase to let the driver use them
 * immediately and prevent an extra allocation upon that phase.
 *
 * Before that patch we had a single large buffer that was dedicated for the
 * STOP_COPY phase as there was a single SAVE in the source for the last
 * image. Once we'll move to chunk mode the idea is to have some small buffers
 * that will be used upon the STOP_COPY phase. The driver will read-ahead from
 * the firmware the full state in small/optimized chunks while letting
 * QEMU/user space read in parallel the available data.
 *
 * Each buffer holds its chunk number to let it be recognized down the road in
 * the coming patches. The chunk buffer size is picked-up based on the minimum
 * size that firmware requires, the total full size and some max value in the
 * driver code which was set to 8MB to achieve some optimized downtime in the
 * general case.
 *
 * As the chunk mode is applicable even if we move directly to STOP_COPY the
 * buffers preparation and some other related stuff is done unconditionally
 * with regards to STOP/PRE-COPY.
 *
 * Note: In that phase in the series we still didn't activate the chunk mode
 * and the first buffer will be used in all the places.
 *
 * Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
 * Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
 * Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com
 * Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 *
 * 2023-09-11 12:38:53 +03:00
 */
/*
 * Queue a record that carries the stop_copy buffer size for the reader.
 *
 * The record is a migration header (tag MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE,
 * flagged MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL) followed by a stop_copy data
 * payload holding the allocated length of migf->buf[0]. It is written into a
 * DMA_NONE buffer, positioned at the current end of the stream (max_pos is
 * advanced accordingly) and appended to the file's buf_list.
 *
 * @track: when true, the record size is also stored as the file's
 *         pre_copy_initial_bytes.
 *
 * Returns 0 on success, the error from mlx5vf_get_data_buffer(), or -EINVAL
 * when the buffer has no backing page (the buffer is put back in that case).
 */
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vf_migration_header header = {};
	struct mlx5_vhca_data_buffer *header_buf;
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}

	/* Header first, payload right behind it, both in the first page */
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);

	header_buf->start_pos = migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);

	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;

err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}
/*
 * History note (commit message captured alongside the following lines):
 *
 * vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase
 *
 * This patch is another preparation step towards working in chunk mode.
 * It pre-allocates chunks for the STOP_COPY phase to let the driver use them
 * immediately and prevent an extra allocation upon that phase.
 *
 * Before that patch we had a single large buffer that was dedicated for the
 * STOP_COPY phase as there was a single SAVE in the source for the last
 * image. Once we'll move to chunk mode the idea is to have some small buffers
 * that will be used upon the STOP_COPY phase. The driver will read-ahead from
 * the firmware the full state in small/optimized chunks while letting
 * QEMU/user space read in parallel the available data.
 *
 * Each buffer holds its chunk number to let it be recognized down the road in
 * the coming patches. The chunk buffer size is picked-up based on the minimum
 * size that firmware requires, the total full size and some max value in the
 * driver code which was set to 8MB to achieve some optimized downtime in the
 * general case.
 *
 * As the chunk mode is applicable even if we move directly to STOP_COPY the
 * buffers preparation and some other related stuff is done unconditionally
 * with regards to STOP/PRE-COPY.
 *
 * Note: In that phase in the series we still didn't activate the chunk mode
 * and the first buffer will be used in all the places.
 *
 * Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
 * Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
 * Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com
 * Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 *
 * 2023-09-11 12:38:53 +03:00
 */
/*
 * Pre-allocate the buffers used by the STOP_COPY phase and queue the
 * stop_copy size record.
 *
 * Buffer size:
 *  - chunk mode: the larger of @state_size and min(MAX_CHUNK_SIZE, @full_size);
 *  - otherwise: @state_size, grown by 10 percent when @track is set (unless
 *    that addition overflows);
 *  and in both cases capped by the device specification max SAVE size.
 *
 * One data buffer and one header buffer are stored in migf->buf[] /
 * migf->buf_header[] per chunk (MAX_NUM_CHUNKS chunks in chunk mode, one
 * otherwise). In chunk mode each pair is tagged with its 1-based chunk number
 * and the matching save work item is initialized.
 *
 * Returns 0 on success. On failure every buffer stored in the slots is put
 * back, the slots are cleared and the error code is returned.
 */
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		/*
		 * Compare as u64 so that @full_size is not truncated before
		 * the comparison where size_t is narrower than u64; the
		 * result never exceeds MAX_CHUNK_SIZE.
		 */
		size_t chunk_size = min_t(u64, MAX_CHUNK_SIZE, full_size);

		/* from firmware perspective at least 'state_size' buffer should be set */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* let's be ready for stop_copy size that might grow by 10 percent */
			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* let's not overflow the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	/* Slots never filled are skipped by the NULL checks below */
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}
/*
 * ioctl handler of the saving migration file. Only
 * VFIO_MIG_GET_PRECOPY_INFO is accepted.
 *
 * Fills struct vfio_precopy_info with the number of bytes that are still to
 * be read from the file position onwards, reported either as initial_bytes
 * or as dirty_bytes. When everything that was saved so far has been consumed
 * and the device reports additional state, a new SAVE command is issued
 * before returning so that data is ready for the next read.
 *
 * Return: 0 on success, -ENOTTY for any other command, -EFAULT when the user
 * buffer can't be accessed, -EINVAL when argsz is too small or the device is
 * not in PRE_COPY/PRE_COPY_P2P, -ENODEV when the migration file is in the
 * error state, or the error of the query/buffer/SAVE helpers.
 */
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	/* Only the part up to and including dirty_bytes is exchanged. */
	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	/*
	 * Taken with a plain mutex_lock() but always released through
	 * mlx5vf_state_mutex_unlock().
	 * NOTE(review): presumably that helper also handles work deferred
	 * while the mutex was held - confirm against its definition.
	 */
	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
	 * bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As so, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		/* The reader is still inside the initial part of the stream. */
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		/*
		 * Past the initial part: report what is already saved but
		 * unread, plus what the device reported in inc_length.
		 */
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	/* Nothing new to save unless the reader caught up and data is pending. */
	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	/* migf->lock is dropped before the buffer is taken and SAVE is issued. */
	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, save a new state to be ready for.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	/* The user copy is done only after the state mutex was released. */
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}
/*
 * File operations of the read-only migration file that is created by
 * mlx5vf_pci_save_device_data(): a non-seekable stream that can be read,
 * polled and queried through the pre-copy ioctl.
 */
static const struct file_operations mlx5vf_save_fops = {
	.owner		= THIS_MODULE,
	.llseek		= no_llseek,
	.read		= mlx5vf_save_read,
	.poll		= mlx5vf_save_poll,
	.unlocked_ioctl	= mlx5vf_precopy_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.release	= mlx5vf_release_file,
};
/*
 * Query the device for its final incremental state size and issue a SAVE of
 * that state into the stop-copy buffer of the current saving file.
 *
 * Return: 0 on success, -ENODEV when the saving file is already in the error
 * state, otherwise the helper's error; on such an error the saving file is
 * marked as being in error.
 */
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *stop_copy_buf;
	size_t inc_size;
	int err;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	err = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_size, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (err)
		goto mark_err;

	stop_copy_buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, inc_size);
	if (IS_ERR(stop_copy_buf)) {
		err = PTR_ERR(stop_copy_buf);
		goto mark_err;
	}

	err = mlx5vf_cmd_save_vhca_state(mvdev, migf, stop_copy_buf, true,
					 false);
	if (!err)
		return 0;

	/* The SAVE was not started, hand the buffer back. */
	mlx5vf_put_data_buffer(stop_copy_buf);
mark_err:
	mlx5vf_mark_err(migf);
	return err;
}
/*
 * Create the saving migration file and issue the first SAVE command.
 *
 * @mvdev: the device whose state is saved
 * @track: passed on to mlx5vf_prep_stop_copy() and to the SAVE command. When
 *         set, a separate buffer is allocated for this SAVE so that the
 *         pre-allocated migf->buf[0] stays available for the stop-copy phase;
 *         otherwise migf->buf[0] itself is consumed.
 *
 * Ownership: once anon_inode_getfile() succeeded, migf is the file's
 * private_data and must only be released by dropping the file reference.
 * mlx5vf_disable_fds() tears files down with fput() alone, i.e. the
 * ->release callback (mlx5vf_release_file) is the one that frees migf.
 * Freeing it here as well would be a double free, so the unwind path uses
 * kfree() only while no file exists and fput() afterwards.
 *
 * Return: the new migration file, or an ERR_PTR() on failure.
 */
static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		/* No file was created, nobody else is going to free migf. */
		kfree(migf);
		return ERR_PTR(ret);
	}

	/*
	 * Initialize everything the release path may touch before the first
	 * point of failure: from here on every error ends in fput().
	 */
	migf->mvdev = mvdev;
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);

	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf,
			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		/* Take the prepared buffer over; migf no longer references it. */
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;

out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	/* Dropping the only reference releases the file together with migf. */
	fput(migf->filp);
	return ERR_PTR(ret);
}
/*
 * Copy user data into the single page of @vhca_buf that backs file position
 * *pos. At most the remainder of that page is copied.
 *
 * On success *pos, *done and *buf are advanced, *len is reduced and
 * vhca_buf->length grows by the number of bytes copied.
 *
 * Return: 0 on success, -EINVAL when the buffer has no page at that offset,
 * -EFAULT when the user memory can't be read.
 */
static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long buf_off = *pos - vhca_buf->start_pos;
	size_t in_page = buf_off % PAGE_SIZE;
	struct page *pg;
	size_t chunk;
	u8 *kaddr;
	int uncopied;

	pg = mlx5vf_get_migration_page(vhca_buf, buf_off - in_page);
	if (!pg)
		return -EINVAL;

	/* Never cross the page boundary within one call. */
	chunk = min_t(size_t, *len, PAGE_SIZE - in_page);

	kaddr = kmap_local_page(pg);
	uncopied = copy_from_user(kaddr + in_page, *buf, chunk);
	kunmap_local(kaddr);
	if (uncopied)
		return -EFAULT;

	*buf += chunk;
	*len -= chunk;
	*pos += chunk;
	*done += chunk;
	vhca_buf->length += chunk;

	return 0;
}
/*
 * Pull as much of the current image as the caller supplied into @vhca_buf,
 * page by page. Once @image_size bytes were gathered the load state moves to
 * MLX5_VF_LOAD_STATE_LOAD_IMAGE and *has_work is set so that the caller keeps
 * running its state machine even when no user data is left.
 *
 * Return: 0 on success or the error of mlx5vf_append_page_to_mig_buf().
 */
static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t wanted = min_t(size_t, *len, image_size - vhca_buf->length);
	size_t remaining;
	int ret;

	for (remaining = wanted; remaining; ) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &remaining,
						    pos, done);
		if (ret)
			return ret;
	}

	*len -= wanted;

	/* Image still incomplete, wait for more data from user space. */
	if (vhca_buf->length != image_size)
		return 0;

	migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
	migf->max_pos += image_size;
	*has_work = true;

	return 0;
}
/*
 * Gather the data that follows a record header (migf->record_size bytes) into
 * @vhca_buf. When the record is complete, a STOP_COPY_SIZE record is parsed
 * into migf->stop_copy_prep_size (capped at MAX_LOAD_SIZE); any other tag
 * that got here is simply skipped. The state machine then goes back to
 * reading the next header.
 *
 * Return: 0 on success, -EINVAL when the buffer has no first page, or the
 * error of mlx5vf_append_page_to_mig_buf().
 */
static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t wanted, remaining;
	int ret;

	wanted = min_t(size_t, *len, migf->record_size - vhca_buf->length);
	for (remaining = wanted; remaining; ) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &remaining,
						    pos, done);
		if (ret)
			return ret;
	}

	*len -= wanted;

	/* Record data still incomplete, more user data is needed. */
	if (vhca_buf->length != migf->record_size)
		return 0;

	if (migf->record_tag == MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE) {
		struct page *first_page;
		u8 *kaddr;

		first_page = mlx5vf_get_migration_page(vhca_buf, 0);
		if (!first_page)
			return -EINVAL;

		kaddr = kmap_local_page(first_page);
		migf->stop_copy_prep_size = min_t(u64,
			le64_to_cpup((__le64 *)kaddr), MAX_LOAD_SIZE);
		kunmap_local(kaddr);
	}
	/* Any other tag is an optional one, its data is dropped. */

	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	migf->max_pos += migf->record_size;
	vhca_buf->length = 0;

	return 0;
}
/*
 * Accumulate a struct mlx5_vf_migration_header in the first page of
 * @vhca_buf. Once it is complete, record size, flags and tag are decoded
 * (little endian) and the next load state is selected:
 *   - FW_DATA tag:              prepare the image buffer
 *   - STOP_COPY_SIZE tag:       prepare to read the record data
 *   - unknown tag, optional:    prepare to read (and skip) the record data
 *   - unknown tag, mandatory:   fail with -EOPNOTSUPP
 *
 * Return: 0 on success, -EINVAL when the buffer has no first page, -EFAULT
 * when the user memory can't be read, -ENOMEM when the record size exceeds
 * MAX_LOAD_SIZE, -EOPNOTSUPP for an unknown mandatory tag.
 */
static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *first_page;
	u64 record_size;
	size_t chunk;
	u32 flags;
	u8 *hdr;
	int ret = 0;

	chunk = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	first_page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!first_page)
		return -EINVAL;

	hdr = kmap_local_page(first_page);
	if (copy_from_user(hdr + vhca_buf->length, *buf, chunk)) {
		ret = -EFAULT;
		goto unmap;
	}

	*buf += chunk;
	*len -= chunk;
	*pos += chunk;
	*done += chunk;
	vhca_buf->length += chunk;

	/* Header still partial, the rest comes with a later write. */
	if (vhca_buf->length != sizeof(struct mlx5_vf_migration_header))
		goto unmap;

	record_size = le64_to_cpup((__le64 *)hdr);
	if (record_size > MAX_LOAD_SIZE) {
		ret = -ENOMEM;
		goto unmap;
	}

	migf->record_size = record_size;
	flags = le32_to_cpup((__le32 *)(hdr +
		    offsetof(struct mlx5_vf_migration_header, flags)));
	migf->record_tag = le32_to_cpup((__le32 *)(hdr +
		    offsetof(struct mlx5_vf_migration_header, tag)));

	if (migf->record_tag == MLX5_MIGF_HEADER_TAG_FW_DATA) {
		migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
	} else if (migf->record_tag == MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE ||
		   (flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
		/* Optional records we don't know are read and skipped. */
		migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
	} else {
		ret = -EOPNOTSUPP;
		goto unmap;
	}

	migf->max_pos += vhca_buf->length;
	vhca_buf->length = 0;
	*has_work = true;

unmap:
	kunmap_local(hdr);
	return ret;
}
/*
 * write() handler of the resuming migration file.
 *
 * The incoming stream is a sequence of records: a header, then either
 * header data or a device image. The function runs a state machine over
 * migf->load_state and keeps going while user data is left (@len) or while
 * a step asked for another iteration (has_work), e.g. to load a completed
 * image into the device.
 *
 * The data and header buffers may be freed and replaced by the state
 * machine, always under migf->lock. They are therefore looked up only after
 * the lock was taken, so a write racing with another write on the same file
 * can't keep using a pointer that the other one already freed.
 *
 * Return: number of bytes consumed, -ESPIPE when called with an explicit
 * position, -EINVAL on a negative or overflowing position, -ENODEV when the
 * file is in the error state, or the error of the failing step. Any failure
 * inside the state machine moves the file to the error state.
 */
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	struct mlx5_vhca_data_buffer *vhca_buf_header;
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	/* Stream semantics only, the position is kept in the file. */
	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	/* Stable from here on: replacements happen under migf->lock only. */
	vhca_buf = migf->buf[0];
	vhca_buf_header = migf->buf_header[0];

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			/* Grow the header buffer when the record doesn't fit. */
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			/*
			 * Size the buffer for the larger of this image and
			 * the announced stop-copy size.
			 */
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}
/*
 * File operations of the write-only migration file that is created by
 * mlx5vf_pci_resume_device_data(): a non-seekable stream fed through write().
 */
static const struct file_operations mlx5vf_resume_fops = {
	.owner		= THIS_MODULE,
	.llseek		= no_llseek,
	.write		= mlx5vf_resume_write,
	.release	= mlx5vf_release_file,
};
static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
struct mlx5_vf_migration_file *migf;
struct mlx5_vhca_data_buffer *buf;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
if (!migf)
return ERR_PTR(-ENOMEM);
migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
O_WRONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
goto end;
}
migf->mvdev = mvdev;
ret = mlx5vf_cmd_alloc_pd(migf);
if (ret)
goto out_free;
buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_pd;
}
vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase This patch is another preparation step towards working in chunk mode. It pre-allocates chunks for the STOP_COPY phase to let the driver use them immediately and prevent an extra allocation upon that phase. Before that patch we had a single large buffer that was dedicated for the STOP_COPY phase as there was a single SAVE in the source for the last image. Once we'll move to chunk mode the idea is to have some small buffers that will be used upon the STOP_COPY phase. The driver will read-ahead from the firmware the full state in small/optimized chunks while letting QEMU/user space read in parallel the available data. Each buffer holds its chunk number to let it be recognized down the road in the coming patches. The chunk buffer size is picked-up based on the minimum size that firmware requires, the total full size and some max value in the driver code which was set to 8MB to achieve some optimized downtime in the general case. As the chunk mode is applicable even if we move directly to STOP_COPY the buffers preparation and some other related stuff is done unconditionally with regards to STOP/PRE-COPY. Note: In that phase in the series we still didn't activate the chunk mode and the first buffer will be used in all the places. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2023-09-11 12:38:53 +03:00
migf->buf[0] = buf;
buf = mlx5vf_alloc_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_buf;
}
migf->buf_header[0] = buf;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
return migf;
out_buf:
vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase This patch is another preparation step towards working in chunk mode. It pre-allocates chunks for the STOP_COPY phase to let the driver use them immediately and prevent an extra allocation upon that phase. Before that patch we had a single large buffer that was dedicated for the STOP_COPY phase as there was a single SAVE in the source for the last image. Once we'll move to chunk mode the idea is to have some small buffers that will be used upon the STOP_COPY phase. The driver will read-ahead from the firmware the full state in small/optimized chunks while letting QEMU/user space read in parallel the available data. Each buffer holds its chunk number to let it be recognized down the road in the coming patches. The chunk buffer size is picked-up based on the minimum size that firmware requires, the total full size and some max value in the driver code which was set to 8MB to achieve some optimized downtime in the general case. As the chunk mode is applicable even if we move directly to STOP_COPY the buffers preparation and some other related stuff is done unconditionally with regards to STOP/PRE-COPY. Note: In that phase in the series we still didn't activate the chunk mode and the first buffer will be used in all the places. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2023-09-11 12:38:53 +03:00
mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
mlx5vf_cmd_dealloc_pd(migf);
out_free:
fput(migf->filp);
end:
kfree(migf);
return ERR_PTR(ret);
}
/* Detach and release the RESUMING migration file, if one is attached. */
static void mlx5vf_disable_resuming_fd(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->resuming_migf)
		return;

	mlx5vf_disable_fd(mvdev->resuming_migf);
	mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
	/* Drop the file reference held through mvdev->resuming_migf */
	fput(mvdev->resuming_migf->filp);
	mvdev->resuming_migf = NULL;
}

/*
 * Detach and release the SAVING migration file, if one is attached.
 *
 * When @last_save_state is not NULL it receives the migration file state as
 * sampled after the async context was cleaned up and the async work was
 * cancelled, and before the file is disabled. It is left untouched when no
 * saving file is attached.
 */
static void mlx5vf_disable_saving_fd(struct mlx5vf_pci_core_device *mvdev,
				     enum mlx5_vf_migf_state *last_save_state)
{
	if (!mvdev->saving_migf)
		return;

	mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
	cancel_work_sync(&mvdev->saving_migf->async_data.work);
	if (last_save_state)
		*last_save_state = mvdev->saving_migf->state;
	mlx5vf_disable_fd(mvdev->saving_migf);
	/* Kick anyone sleeping on the file's poll wait queue */
	wake_up_interruptible(&mvdev->saving_migf->poll_wait);
	mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
	/* Drop the file reference held through mvdev->saving_migf */
	fput(mvdev->saving_migf->filp);
	mvdev->saving_migf = NULL;
}

/*
 * Disable and release whatever migration files are currently attached to
 * @mvdev: first the resuming one, then the saving one.
 *
 * @last_save_state: optional (may be NULL); see mlx5vf_disable_saving_fd().
 */
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
			enum mlx5_vf_migf_state *last_save_state)
{
	mlx5vf_disable_resuming_fd(mvdev);
	mlx5vf_disable_saving_fd(mvdev, last_save_state);
}
/*
 * Perform exactly one migration state arc: from mvdev->mig_state to @new.
 *
 * The function does not update mvdev->mig_state itself; the caller records
 * the new state once the step succeeded. The visible caller invokes it with
 * mvdev->state_mutex held.
 *
 * Return: NULL when the arc completed and produced no new migration file,
 * a struct file pointer (carrying an extra reference taken with get_file())
 * for the arcs that open a new migration file, or an ERR_PTR() on failure.
 */
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	/* RUNNING_P2P -> STOP */
	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	    new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/* STOP -> RUNNING_P2P */
	if (cur == VFIO_DEVICE_STATE_STOP &&
	    new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/* RUNNING -> RUNNING_P2P, PRE_COPY -> PRE_COPY_P2P */
	if ((cur == VFIO_DEVICE_STATE_RUNNING &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/* RUNNING_P2P -> RUNNING, PRE_COPY_P2P -> PRE_COPY */
	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/* STOP -> STOP_COPY: open a new saving file and hand it back */
	if (cur == VFIO_DEVICE_STATE_STOP &&
	    new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *saving;

		saving = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(saving))
			return ERR_CAST(saving);
		/* Extra reference for the file pointer returned below */
		get_file(saving->filp);
		mvdev->saving_migf = saving;
		return saving->filp;
	}

	/* STOP_COPY -> STOP */
	if (cur == VFIO_DEVICE_STATE_STOP_COPY &&
	    new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	/* PRE_COPY -> RUNNING, PRE_COPY_P2P -> RUNNING_P2P */
	if ((cur == VFIO_DEVICE_STATE_PRE_COPY &&
	     new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		struct mlx5_vf_migration_file *saving = mvdev->saving_migf;
		struct mlx5_vhca_data_buffer *cleanup_buf;
		enum mlx5_vf_migf_state last_state;
		size_t query_size;

		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &query_size,
				NULL, MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
		if (ret)
			return ERR_PTR(ret);

		cleanup_buf = mlx5vf_get_data_buffer(saving, query_size,
						     DMA_FROM_DEVICE);
		if (IS_ERR(cleanup_buf))
			return ERR_CAST(cleanup_buf);

		/* pre_copy cleanup */
		ret = mlx5vf_cmd_save_vhca_state(mvdev, saving, cleanup_buf,
						 false, false);
		if (ret) {
			mlx5vf_put_data_buffer(cleanup_buf);
			return ERR_PTR(ret);
		}

		/* Tear the files down and report an error left on the file */
		mlx5vf_disable_fds(mvdev, &last_state);
		if (last_state == MLX5_MIGF_STATE_ERROR)
			return ERR_PTR(-EIO);
		return NULL;
	}

	/* STOP -> RESUMING: open a new resuming file and hand it back */
	if (cur == VFIO_DEVICE_STATE_STOP &&
	    new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *resuming;

		resuming = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(resuming))
			return ERR_CAST(resuming);
		/* Extra reference for the file pointer returned below */
		get_file(resuming->filp);
		mvdev->resuming_migf = resuming;
		return resuming->filp;
	}

	/* RESUMING -> STOP */
	if (cur == VFIO_DEVICE_STATE_RESUMING &&
	    new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	/* RUNNING -> PRE_COPY, RUNNING_P2P -> PRE_COPY_P2P */
	if ((cur == VFIO_DEVICE_STATE_RUNNING &&
	     new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *saving;

		saving = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(saving))
			return ERR_CAST(saving);
		/* Extra reference for the file pointer returned below */
		get_file(saving->filp);
		mvdev->saving_migf = saving;
		return saving->filp;
	}

	/* PRE_COPY_P2P -> STOP_COPY: no new file is returned on this arc */
	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	    new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}
/*
 * Release state_mutex. Every unlock of state_mutex goes through here so that
 * a pending 'deferred_reset' is handled before the mutex is dropped.
 *
 * While deferred_reset is set, the flag is cleared, the migration state is
 * forced back to RUNNING and the migration files are disabled; the flag is
 * then re-checked. The mutex is released while reset_lock is still held,
 * which is the same lock mlx5vf_pci_aer_reset_done() holds while setting the
 * flag and trying to take the mutex.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
	for (;;) {
		spin_lock(&mvdev->reset_lock);
		if (!mvdev->deferred_reset)
			break;

		mvdev->deferred_reset = false;
		/* The cleanup below runs without reset_lock held */
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev, NULL);
	}

	/* reset_lock is held here: no deferred reset is pending */
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}
static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state new_state)
{
struct mlx5vf_pci_core_device *mvdev = container_of(
vdev, struct mlx5vf_pci_core_device, core_device.vdev);
enum vfio_device_mig_state next_state;
struct file *res = NULL;
int ret;
mutex_lock(&mvdev->state_mutex);
while (new_state != mvdev->mig_state) {
ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
new_state, &next_state);
if (ret) {
res = ERR_PTR(ret);
break;
}
res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
if (IS_ERR(res))
break;
mvdev->mig_state = next_state;
if (WARN_ON(res && new_state != mvdev->mig_state)) {
fput(res);
res = ERR_PTR(-EINVAL);
break;
}
}
mlx5vf_state_mutex_unlock(mvdev);
return res;
}
/*
 * vfio migration_get_data_size callback.
 *
 * Queries the device migration state under state_mutex and, on success,
 * stores the reported total size in *@stop_copy_length. On failure
 * *@stop_copy_length is not written.
 *
 * Return: 0 on success, otherwise the error from the query command.
 */
static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(vdev, struct mlx5vf_pci_core_device,
			     core_device.vdev);
	size_t state_sz;	/* filled by the query, not consumed here */
	u64 full_sz;
	int err;

	mutex_lock(&mvdev->state_mutex);
	err = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_sz,
						    &full_sz, 0);
	if (err)
		goto unlock;

	*stop_copy_length = full_sz;
unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
/*
 * vfio migration_get_state callback: report the current migration state.
 *
 * The state is read under state_mutex and stored in *@curr_state.
 * Always returns 0.
 */
static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(vdev, struct mlx5vf_pci_core_device,
			     core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);

	return 0;
}
/*
 * PCI error handler reset_done callback.
 *
 * Does nothing for devices without migration capability. Otherwise it flags
 * a deferred reset and, if state_mutex is free, takes and immediately
 * releases it through mlx5vf_state_mutex_unlock() so the reset cleanup runs
 * right away.
 */
static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
	int got_mutex;

	if (!mvdev->migrate_cap)
		return;

	/*
	 * The higher VFIO layers hold locks across reset and use those same
	 * locks together with the mm_lock, so taking state_mutex
	 * unconditionally here could produce an ABBA deadlock between
	 * state_mutex and mm_lock.
	 * If state_mutex is already taken, the cleanup work is left to the
	 * unlock flow of the context that currently owns it.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	got_mutex = mutex_trylock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);

	if (got_mutex)
		mlx5vf_state_mutex_unlock(mvdev);
}
/*
 * vfio open_device callback.
 *
 * Enables the underlying vfio-pci core device; for migration capable
 * devices the migration state starts at RUNNING. The enable is finished
 * only after that state has been set.
 *
 * Return: 0 on success, or the error from vfio_pci_core_enable().
 */
static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(core_vdev, struct mlx5vf_pci_core_device,
			     core_device.vdev);
	int ret;

	ret = vfio_pci_core_enable(&mvdev->core_device);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;

	vfio_pci_core_finish_enable(&mvdev->core_device);
	return 0;
}
/*
 * vfio close_device callback: close the driver's migration side first, then
 * let the vfio-pci core close the device.
 */
static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(core_vdev, struct mlx5vf_pci_core_device,
			     core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}
/* Migration callbacks, handed to mlx5vf_cmd_set_migratable() at init time */
static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state		= mlx5vf_pci_set_device_state,
	.migration_get_state		= mlx5vf_pci_get_device_state,
	.migration_get_data_size	= mlx5vf_pci_get_data_size,
};
/* Logging callbacks, backed by the driver's page tracker functions */
static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start		= mlx5vf_start_page_tracker,
	.log_stop		= mlx5vf_stop_page_tracker,
	.log_read_and_clear	= mlx5vf_tracker_read_and_clear,
};
/*
 * vfio init callback: initialise the vfio-pci core part of the device and,
 * if that succeeded, register the migration and logging ops with the
 * driver's command layer.
 *
 * Return: 0 on success, or the error from vfio_pci_core_init_dev().
 */
static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(core_vdev, struct mlx5vf_pci_core_device,
			     core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (!ret)
		mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
					  &mlx5vf_pci_log_ops);

	return ret;
}
/*
 * vfio release callback: undo mlx5vf_cmd_set_migratable() first, then
 * release the vfio-pci core part of the device.
 */
static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(core_vdev, struct mlx5vf_pci_core_device,
			     core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}
/*
 * vfio device ops for this driver. Only init/release/open/close are driver
 * specific; every other entry points at a vfio-pci core or vfio iommufd
 * helper.
 */
static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name		= "mlx5-vfio-pci",

	/* driver specific callbacks */
	.init		= mlx5vf_pci_init_dev,
	.release	= mlx5vf_pci_release_dev,
	.open_device	= mlx5vf_pci_open_device,
	.close_device	= mlx5vf_pci_close_device,

	/* vfio-pci core helpers */
	.ioctl		= vfio_pci_core_ioctl,
	.device_feature	= vfio_pci_core_ioctl_feature,
	.read		= vfio_pci_core_read,
	.write		= vfio_pci_core_write,
	.mmap		= vfio_pci_core_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,

	/* vfio iommufd helpers */
	.bind_iommufd	= vfio_iommufd_physical_bind,
	.unbind_iommufd	= vfio_iommufd_physical_unbind,
	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
};
/*
 * PCI probe callback: allocate the vfio device wrapping @pdev, publish its
 * vfio-pci core device as the PCI driver data and register it with the
 * vfio-pci core.
 *
 * Return: 0 on success or a negative errno. If registration fails, the
 * reference obtained from vfio_alloc_device() is dropped again.
 */
static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);

	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		vfio_put_device(&mvdev->core_device.vdev);

	return ret;
}
/*
 * PCI remove callback: unregister the device from the vfio-pci core, then
 * drop the vfio device reference.
 */
static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}
/* PCI IDs this driver binds to (vfio driver_override matching) */
static const struct pci_device_id mlx5vf_pci_table[] = {
	/* ConnectX Family mlx5Gen Virtual Function */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) },
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
/*
 * PCI error handlers: reset_done is driver specific, error_detected is the
 * vfio-pci core handler.
 */
static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done	= mlx5vf_pci_aer_reset_done,
	.error_detected	= vfio_pci_core_aer_err_detected,
};
/* PCI driver descriptor, registered through module_pci_driver() */
static struct pci_driver mlx5vf_pci_driver = {
	.name			= KBUILD_MODNAME,
	.id_table		= mlx5vf_pci_table,
	.probe			= mlx5vf_pci_probe,
	.remove			= mlx5vf_pci_remove,
	.err_handler		= &mlx5vf_err_handlers,
	.driver_managed_dma	= true,
};
/* Module registration and metadata */
module_pci_driver(mlx5vf_pci_driver);

/* The ops table above references vfio_iommufd_physical_* helpers */
MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION("MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");