linux/drivers/vfio/pci/pds/lm.c

446 lines
10 KiB
C
Raw Permalink Normal View History

2023-08-07 13:57:52 -07:00
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>
#include "vfio_dev.h"
#include "cmds.h"
static struct pds_vfio_lm_file *
pds_vfio_get_lm_file(const struct file_operations *fops, int flags, u64 size)
{
struct pds_vfio_lm_file *lm_file = NULL;
unsigned long long npages;
struct page **pages;
void *page_mem;
const void *p;
if (!size)
return NULL;
/* Alloc file structure */
lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
if (!lm_file)
return NULL;
/* Create file */
lm_file->filep =
anon_inode_getfile("pds_vfio_lm", fops, lm_file, flags);
if (IS_ERR(lm_file->filep))
2023-08-07 13:57:52 -07:00
goto out_free_file;
stream_open(lm_file->filep->f_inode, lm_file->filep);
mutex_init(&lm_file->lock);
/* prevent file from being released before we are done with it */
get_file(lm_file->filep);
/* Allocate memory for file pages */
npages = DIV_ROUND_UP_ULL(size, PAGE_SIZE);
pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
if (!pages)
goto out_put_file;
page_mem = kvzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
if (!page_mem)
goto out_free_pages_array;
p = page_mem - offset_in_page(page_mem);
for (unsigned long long i = 0; i < npages; i++) {
if (is_vmalloc_addr(p))
pages[i] = vmalloc_to_page(p);
else
pages[i] = kmap_to_page((void *)p);
if (!pages[i])
goto out_free_page_mem;
p += PAGE_SIZE;
}
/* Create scatterlist of file pages to use for DMA mapping later */
if (sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, 0,
size, GFP_KERNEL))
goto out_free_page_mem;
lm_file->size = size;
lm_file->pages = pages;
lm_file->npages = npages;
lm_file->page_mem = page_mem;
lm_file->alloc_size = npages * PAGE_SIZE;
return lm_file;
out_free_page_mem:
kvfree(page_mem);
out_free_pages_array:
kfree(pages);
out_put_file:
fput(lm_file->filep);
mutex_destroy(&lm_file->lock);
out_free_file:
kfree(lm_file);
return NULL;
}
static void pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
{
mutex_lock(&lm_file->lock);
vfio/pds: Make sure migration file isn't accessed after reset It's possible the migration file is accessed after reset when it has been cleaned up, especially when it's initiated by the device. This is because the driver doesn't rip out the filep when cleaning up it only frees the related page structures and sets its local struct pds_vfio_lm_file pointer to NULL. This can cause a NULL pointer dereference, which is shown in the example below during a restore after a device initiated reset: BUG: kernel NULL pointer dereference, address: 000000000000000c PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:pds_vfio_get_file_page+0x5d/0xf0 [pds_vfio_pci] [...] Call Trace: <TASK> pds_vfio_restore_write+0xf6/0x160 [pds_vfio_pci] vfs_write+0xc9/0x3f0 ? __fget_light+0xc9/0x110 ksys_write+0xb5/0xf0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] Add a disabled flag to the driver's struct pds_vfio_lm_file that gets set during cleanup. Then make sure to check the flag when the migration file is accessed via its file_operations. By default this flag will be false as the memory for struct pds_vfio_lm_file is kzalloc'd, which means the struct pds_vfio_lm_file is enabled and accessible. Also, since the file_operations and driver's migration file cleanup happen under the protection of the same pds_vfio_lm_file.lock, using this flag is thread safe. Fixes: 8512ed256334 ("vfio/pds: Always clear the save/restore FDs on reset") Reviewed-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Brett Creeley <brett.creeley@amd.com> Link: https://lore.kernel.org/r/20240308182149.22036-2-brett.creeley@amd.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2024-03-08 10:21:48 -08:00
lm_file->disabled = true;
2023-08-07 13:57:52 -07:00
lm_file->size = 0;
lm_file->alloc_size = 0;
vfio/pds: Make sure migration file isn't accessed after reset It's possible the migration file is accessed after reset when it has been cleaned up, especially when it's initiated by the device. This is because the driver doesn't rip out the filep when cleaning up it only frees the related page structures and sets its local struct pds_vfio_lm_file pointer to NULL. This can cause a NULL pointer dereference, which is shown in the example below during a restore after a device initiated reset: BUG: kernel NULL pointer dereference, address: 000000000000000c PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:pds_vfio_get_file_page+0x5d/0xf0 [pds_vfio_pci] [...] Call Trace: <TASK> pds_vfio_restore_write+0xf6/0x160 [pds_vfio_pci] vfs_write+0xc9/0x3f0 ? __fget_light+0xc9/0x110 ksys_write+0xb5/0xf0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] Add a disabled flag to the driver's struct pds_vfio_lm_file that gets set during cleanup. Then make sure to check the flag when the migration file is accessed via its file_operations. By default this flag will be false as the memory for struct pds_vfio_lm_file is kzalloc'd, which means the struct pds_vfio_lm_file is enabled and accessible. Also, since the file_operations and driver's migration file cleanup happen under the protection of the same pds_vfio_lm_file.lock, using this flag is thread safe. Fixes: 8512ed256334 ("vfio/pds: Always clear the save/restore FDs on reset") Reviewed-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Brett Creeley <brett.creeley@amd.com> Link: https://lore.kernel.org/r/20240308182149.22036-2-brett.creeley@amd.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2024-03-08 10:21:48 -08:00
lm_file->filep->f_pos = 0;
2023-08-07 13:57:52 -07:00
/* Free scatter list of file pages */
sg_free_table(&lm_file->sg_table);
kvfree(lm_file->page_mem);
lm_file->page_mem = NULL;
kfree(lm_file->pages);
lm_file->pages = NULL;
mutex_unlock(&lm_file->lock);
/* allow file to be released since we are done with it */
fput(lm_file->filep);
}
void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
{
if (!pds_vfio->save_file)
return;
pds_vfio_put_lm_file(pds_vfio->save_file);
pds_vfio->save_file = NULL;
}
void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
{
if (!pds_vfio->restore_file)
return;
pds_vfio_put_lm_file(pds_vfio->restore_file);
pds_vfio->restore_file = NULL;
}
static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
unsigned long offset)
{
unsigned long cur_offset = 0;
struct scatterlist *sg;
unsigned int i;
/* All accesses are sequential */
if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
lm_file->last_offset = 0;
lm_file->last_offset_sg = lm_file->sg_table.sgl;
lm_file->sg_last_entry = 0;
}
cur_offset = lm_file->last_offset;
for_each_sg(lm_file->last_offset_sg, sg,
lm_file->sg_table.orig_nents - lm_file->sg_last_entry, i) {
if (offset < sg->length + cur_offset) {
lm_file->last_offset_sg = sg;
lm_file->sg_last_entry += i;
lm_file->last_offset = cur_offset;
return nth_page(sg_page(sg),
(offset - cur_offset) / PAGE_SIZE);
}
cur_offset += sg->length;
}
return NULL;
}
static int pds_vfio_release_file(struct inode *inode, struct file *filp)
{
struct pds_vfio_lm_file *lm_file = filp->private_data;
mutex_lock(&lm_file->lock);
lm_file->filep->f_pos = 0;
lm_file->size = 0;
mutex_unlock(&lm_file->lock);
mutex_destroy(&lm_file->lock);
kfree(lm_file);
return 0;
}
static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf,
size_t len, loff_t *pos)
{
struct pds_vfio_lm_file *lm_file = filp->private_data;
ssize_t done = 0;
if (pos)
return -ESPIPE;
pos = &filp->f_pos;
mutex_lock(&lm_file->lock);
vfio/pds: Make sure migration file isn't accessed after reset It's possible the migration file is accessed after reset when it has been cleaned up, especially when it's initiated by the device. This is because the driver doesn't rip out the filep when cleaning up it only frees the related page structures and sets its local struct pds_vfio_lm_file pointer to NULL. This can cause a NULL pointer dereference, which is shown in the example below during a restore after a device initiated reset: BUG: kernel NULL pointer dereference, address: 000000000000000c PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:pds_vfio_get_file_page+0x5d/0xf0 [pds_vfio_pci] [...] Call Trace: <TASK> pds_vfio_restore_write+0xf6/0x160 [pds_vfio_pci] vfs_write+0xc9/0x3f0 ? __fget_light+0xc9/0x110 ksys_write+0xb5/0xf0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] Add a disabled flag to the driver's struct pds_vfio_lm_file that gets set during cleanup. Then make sure to check the flag when the migration file is accessed via its file_operations. By default this flag will be false as the memory for struct pds_vfio_lm_file is kzalloc'd, which means the struct pds_vfio_lm_file is enabled and accessible. Also, since the file_operations and driver's migration file cleanup happen under the protection of the same pds_vfio_lm_file.lock, using this flag is thread safe. Fixes: 8512ed256334 ("vfio/pds: Always clear the save/restore FDs on reset") Reviewed-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Brett Creeley <brett.creeley@amd.com> Link: https://lore.kernel.org/r/20240308182149.22036-2-brett.creeley@amd.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2024-03-08 10:21:48 -08:00
if (lm_file->disabled) {
done = -ENODEV;
goto out_unlock;
}
2023-08-07 13:57:52 -07:00
if (*pos > lm_file->size) {
done = -EINVAL;
goto out_unlock;
}
len = min_t(size_t, lm_file->size - *pos, len);
while (len) {
size_t page_offset;
struct page *page;
size_t page_len;
u8 *from_buff;
int err;
page_offset = (*pos) % PAGE_SIZE;
page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
if (!page) {
if (done == 0)
done = -EINVAL;
goto out_unlock;
}
page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
from_buff = kmap_local_page(page);
err = copy_to_user(buf, from_buff + page_offset, page_len);
kunmap_local(from_buff);
if (err) {
done = -EFAULT;
goto out_unlock;
}
*pos += page_len;
len -= page_len;
done += page_len;
buf += page_len;
}
out_unlock:
mutex_unlock(&lm_file->lock);
return done;
}
static const struct file_operations pds_vfio_save_fops = {
.owner = THIS_MODULE,
.read = pds_vfio_save_read,
.release = pds_vfio_release_file,
};
static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
{
struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
struct pds_vfio_lm_file *lm_file;
u64 size;
int err;
/* Get live migration state size in this state */
err = pds_vfio_get_lm_state_size_cmd(pds_vfio, &size);
if (err) {
dev_err(dev, "failed to get save status: %pe\n", ERR_PTR(err));
return err;
}
dev_dbg(dev, "save status, size = %lld\n", size);
if (!size) {
dev_err(dev, "invalid state size\n");
return -EIO;
}
lm_file = pds_vfio_get_lm_file(&pds_vfio_save_fops, O_RDONLY, size);
if (!lm_file) {
dev_err(dev, "failed to create save file\n");
return -ENOENT;
}
dev_dbg(dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
lm_file->size, lm_file->alloc_size, lm_file->npages);
pds_vfio->save_file = lm_file;
return 0;
}
static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct pds_vfio_lm_file *lm_file = filp->private_data;
loff_t requested_length;
ssize_t done = 0;
if (pos)
return -ESPIPE;
pos = &filp->f_pos;
if (*pos < 0 ||
check_add_overflow((loff_t)len, *pos, &requested_length))
return -EINVAL;
mutex_lock(&lm_file->lock);
vfio/pds: Make sure migration file isn't accessed after reset It's possible the migration file is accessed after reset when it has been cleaned up, especially when it's initiated by the device. This is because the driver doesn't rip out the filep when cleaning up it only frees the related page structures and sets its local struct pds_vfio_lm_file pointer to NULL. This can cause a NULL pointer dereference, which is shown in the example below during a restore after a device initiated reset: BUG: kernel NULL pointer dereference, address: 000000000000000c PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:pds_vfio_get_file_page+0x5d/0xf0 [pds_vfio_pci] [...] Call Trace: <TASK> pds_vfio_restore_write+0xf6/0x160 [pds_vfio_pci] vfs_write+0xc9/0x3f0 ? __fget_light+0xc9/0x110 ksys_write+0xb5/0xf0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] Add a disabled flag to the driver's struct pds_vfio_lm_file that gets set during cleanup. Then make sure to check the flag when the migration file is accessed via its file_operations. By default this flag will be false as the memory for struct pds_vfio_lm_file is kzalloc'd, which means the struct pds_vfio_lm_file is enabled and accessible. Also, since the file_operations and driver's migration file cleanup happen under the protection of the same pds_vfio_lm_file.lock, using this flag is thread safe. Fixes: 8512ed256334 ("vfio/pds: Always clear the save/restore FDs on reset") Reviewed-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Brett Creeley <brett.creeley@amd.com> Link: https://lore.kernel.org/r/20240308182149.22036-2-brett.creeley@amd.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2024-03-08 10:21:48 -08:00
if (lm_file->disabled) {
done = -ENODEV;
goto out_unlock;
}
2023-08-07 13:57:52 -07:00
while (len) {
size_t page_offset;
struct page *page;
size_t page_len;
u8 *to_buff;
int err;
page_offset = (*pos) % PAGE_SIZE;
page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
if (!page) {
if (done == 0)
done = -EINVAL;
goto out_unlock;
}
page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
to_buff = kmap_local_page(page);
err = copy_from_user(to_buff + page_offset, buf, page_len);
kunmap_local(to_buff);
if (err) {
done = -EFAULT;
goto out_unlock;
}
*pos += page_len;
len -= page_len;
done += page_len;
buf += page_len;
lm_file->size += page_len;
}
out_unlock:
mutex_unlock(&lm_file->lock);
return done;
}
static const struct file_operations pds_vfio_restore_fops = {
.owner = THIS_MODULE,
.write = pds_vfio_restore_write,
.release = pds_vfio_release_file,
};
static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
{
struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
struct pds_vfio_lm_file *lm_file;
u64 size;
size = sizeof(union pds_lm_dev_state);
dev_dbg(dev, "restore status, size = %lld\n", size);
if (!size) {
dev_err(dev, "invalid state size");
return -EIO;
}
lm_file = pds_vfio_get_lm_file(&pds_vfio_restore_fops, O_WRONLY, size);
if (!lm_file) {
dev_err(dev, "failed to create restore file");
return -ENOENT;
}
pds_vfio->restore_file = lm_file;
return 0;
}
struct file *
pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
enum vfio_device_mig_state next)
{
enum vfio_device_mig_state cur = pds_vfio->state;
int err;
if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
err = pds_vfio_get_save_file(pds_vfio);
if (err)
return ERR_PTR(err);
err = pds_vfio_get_lm_state_cmd(pds_vfio);
if (err) {
pds_vfio_put_save_file(pds_vfio);
return ERR_PTR(err);
}
return pds_vfio->save_file->filep;
}
if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
pds_vfio_put_save_file(pds_vfio);
vfio/pds: Add support for dirty page tracking In order to support dirty page tracking, the driver has to implement the VFIO subsystem's vfio_log_ops. This includes log_start, log_stop, and log_read_and_clear. All of the tracker resources are allocated and dirty tracking on the device is started during log_start. The resources are cleaned up and dirty tracking on the device is stopped during log_stop. The dirty pages are determined and reported during log_read_and_clear. In order to support these callbacks admin queue commands are used. All of the adminq queue command structures and implementations are included as part of this patch. PDS_LM_CMD_DIRTY_STATUS is added to query the current status of dirty tracking on the device. This includes if it's enabled (i.e. number of regions being tracked from the device's perspective) and the maximum number of regions supported from the device's perspective. PDS_LM_CMD_DIRTY_ENABLE is added to enable dirty tracking on the specified number of regions and their iova ranges. PDS_LM_CMD_DIRTY_DISABLE is added to disable dirty tracking for all regions on the device. PDS_LM_CMD_READ_SEQ and PDS_LM_CMD_DIRTY_WRITE_ACK are added to support reading and acknowledging the currently dirtied pages. Signed-off-by: Brett Creeley <brett.creeley@amd.com> Signed-off-by: Shannon Nelson <shannon.nelson@amd.com> Reviewed-by: Simon Horman <horms@kernel.org> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> Link: https://lore.kernel.org/r/20230807205755.29579-7-brett.creeley@amd.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2023-08-07 13:57:53 -07:00
pds_vfio_dirty_disable(pds_vfio, true);
2023-08-07 13:57:52 -07:00
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
err = pds_vfio_get_restore_file(pds_vfio);
if (err)
return ERR_PTR(err);
return pds_vfio->restore_file->filep;
}
if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
err = pds_vfio_set_lm_state_cmd(pds_vfio);
if (err)
return ERR_PTR(err);
pds_vfio_put_restore_file(pds_vfio);
return NULL;
}
if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
PDS_LM_STA_IN_PROGRESS);
err = pds_vfio_suspend_device_cmd(pds_vfio,
PDS_LM_SUSPEND_RESUME_TYPE_P2P);
if (err)
return ERR_PTR(err);
return NULL;
}
if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
err = pds_vfio_resume_device_cmd(pds_vfio,
PDS_LM_SUSPEND_RESUME_TYPE_FULL);
if (err)
return ERR_PTR(err);
pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
err = pds_vfio_resume_device_cmd(pds_vfio,
PDS_LM_SUSPEND_RESUME_TYPE_P2P);
if (err)
return ERR_PTR(err);
return NULL;
}
if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) {
err = pds_vfio_suspend_device_cmd(pds_vfio,
PDS_LM_SUSPEND_RESUME_TYPE_FULL);
if (err)
return ERR_PTR(err);
return NULL;
}
return ERR_PTR(-EINVAL);
}