virtio, vhost: features, fixes

vhost can now support legacy threading
 	if enabled in Kconfig
 vsock memory allocation strategies for
 	large buffers have been improved,
 	reducing pressure on kmalloc
 vhost now supports the in-order feature
 	guest bits missed the merge window
 
 fixes, cleanups all over the place
 
 Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
 -----BEGIN PGP SIGNATURE-----
 
 iQFDBAABCgAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmiMvQEPHG1zdEByZWRo
 YXQuY29tAAoJECgfDbjSjVRpgr8IAKUrIjqqTYXLkbCWn6tK8T+LxZ6LkMkyHA1v
 AJ+y5fKDeLsT5QpusD1XRjXJVqXBwQEsTN0pNVuhWHlcCpUeOFEHuJaf/QMncbc3
 deFlUfMa3ihniUxBuyhojlWURsf94uTC906lCFXlIsfSKH2CW6/SjKvqR0SH5PhN
 5WaqRYiSFFwDlyG2Ul4e5temP/er2KuZfYyvcYCU8VdSEp6bjvqCHd9ztFIVuByp
 fFWsrHce6IqR8ixOOzavEjzfd8WAN3LGzXntj5KEaX3fZ6HxCZCMv+rNVqvJmLps
 cSrTgIUo60nCiZb8klUCS1YTEEvmdmJg3UmmddIpIhcsCYJSbOU=
 =2dxm
 -----END PGP SIGNATURE-----

Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio updates from Michael Tsirkin:

 - vhost can now support legacy threading if enabled in Kconfig

 - vsock memory allocation strategies for large buffers have been
   improved, reducing pressure on kmalloc

 - vhost now supports the in-order feature. Guest bits missed the merge
   window.

 - fixes, cleanups all over the place

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (30 commits)
  vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers
  vsock/virtio: Rename virtio_vsock_skb_rx_put()
  vhost/vsock: Allocate nonlinear SKBs for handling large receive buffers
  vsock/virtio: Move SKB allocation lower-bound check to callers
  vsock/virtio: Rename virtio_vsock_alloc_skb()
  vsock/virtio: Resize receive buffers so that each SKB fits in a 4K page
  vsock/virtio: Move length check to callers of virtio_vsock_skb_rx_put()
  vsock/virtio: Validate length in packet header before skb_put()
  vhost/vsock: Avoid allocating arbitrarily-sized SKBs
  vhost_net: basic in_order support
  vhost: basic in order support
  vhost: fail early when __vhost_add_used() fails
  vhost: Reintroduce kthread API and add mode selection
  vdpa: Fix IDR memory leak in VDUSE module exit
  vdpa/mlx5: Fix release of uninitialized resources on error path
  vhost-scsi: Fix check for inline_sg_cnt exceeding preallocated limit
  virtio: virtio_dma_buf: fix missing parameter documentation
  vhost: Fix typos
  vhost: vringh: Remove unused functions
  vhost: vringh: Remove unused iotlb functions
  ...
Linus Torvalds 2025-08-01 14:17:48 -07:00
commit 821c9e515d
23 changed files with 574 additions and 342 deletions

@ -130,10 +130,10 @@ static void virtio_gpu_remove(struct virtio_device *vdev)
static void virtio_gpu_shutdown(struct virtio_device *vdev)
{
/*
* drm does its own synchronization on shutdown.
* Do nothing here, opt out of device reset.
*/
struct drm_device *dev = vdev->priv;
/* stop talking to the device */
drm_dev_unplug(dev);
}
static void virtio_gpu_config_changed(struct virtio_device *vdev)

@ -908,6 +908,9 @@ void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
{
struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
if (!mres->wq_gc)
return;
atomic_set(&mres->shutdown, 1);
flush_delayed_work(&mres->gc_dwork_ent);

@ -2491,7 +2491,7 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
}
mvq = &ndev->vqs[idx];
ndev->needs_teardown = num != mvq->num_ent;
ndev->needs_teardown |= num != mvq->num_ent;
mvq->num_ent = num;
}
@ -3432,15 +3432,17 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
ndev = to_mlx5_vdpa_ndev(mvdev);
/* Functions called here should be able to work with
* uninitialized resources.
*/
free_fixed_resources(ndev);
mlx5_vdpa_clean_mrs(mvdev);
mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx);
if (!is_zero_ether_addr(ndev->config.mac)) {
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
}
mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx);
mlx5_vdpa_free_resources(&ndev->mvdev);
free_irqs(ndev);
kfree(ndev->event_cbs);
@ -3888,6 +3890,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
mvdev->actual_features =
(device_features & BIT_ULL(VIRTIO_F_VERSION_1));
mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx);
ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
if (!ndev->vqs || !ndev->event_cbs) {
@ -3960,8 +3964,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
ndev->rqt_size = 1;
}
mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx);
ndev->mvdev.mlx_features = device_features;
mvdev->vdev.dma_dev = &mdev->pdev->dev;
err = mlx5_vdpa_alloc_resources(&ndev->mvdev);

@ -2216,6 +2216,7 @@ static void vduse_exit(void)
cdev_del(&vduse_ctrl_cdev);
unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
class_unregister(&vduse_class);
idr_destroy(&vduse_idr);
}
module_exit(vduse_exit);

@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY
If unsure, say "N".
config VHOST_ENABLE_FORK_OWNER_CONTROL
bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL"
default y
help
This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and
VHOST_GET_FORK_FROM_OWNER. These allow userspace applications
to modify the vhost worker mode for vhost devices.
Also expose module parameter 'fork_from_owner_default' to allow users
to configure the default mode for vhost workers.
By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`,
users can change the worker thread mode as needed.
If this config is disabled (n),the related IOCTLs and parameters will
be unavailable.
If unsure, say "Y".
endif

@ -74,7 +74,8 @@ static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = {
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
(1ULL << VIRTIO_F_ACCESS_PLATFORM) |
(1ULL << VIRTIO_F_RING_RESET),
(1ULL << VIRTIO_F_RING_RESET) |
(1ULL << VIRTIO_F_IN_ORDER),
VIRTIO_BIT(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) |
VIRTIO_BIT(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO),
};
@ -376,7 +377,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
while (j) {
add = min(UIO_MAXIOV - nvq->done_idx, j);
vhost_add_used_and_signal_n(vq->dev, vq,
&vq->heads[nvq->done_idx], add);
&vq->heads[nvq->done_idx],
NULL, add);
nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
j -= add;
}
@ -451,7 +453,8 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
}
static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq,
unsigned int count)
{
struct vhost_virtqueue *vq = &nvq->vq;
struct vhost_dev *dev = vq->dev;
@ -459,7 +462,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
if (!nvq->done_idx)
return;
vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
vhost_add_used_and_signal_n(dev, vq, vq->heads,
vq->nheads, count);
nvq->done_idx = 0;
}
@ -468,6 +472,8 @@ static void vhost_tx_batch(struct vhost_net *net,
struct socket *sock,
struct msghdr *msghdr)
{
struct vhost_virtqueue *vq = &nvq->vq;
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
struct tun_msg_ctl ctl = {
.type = TUN_MSG_PTR,
.num = nvq->batched_xdp,
@ -475,6 +481,11 @@ static void vhost_tx_batch(struct vhost_net *net,
};
int i, err;
if (in_order) {
vq->heads[0].len = 0;
vq->nheads[0] = nvq->done_idx;
}
if (nvq->batched_xdp == 0)
goto signal_used;
@ -496,7 +507,7 @@ static void vhost_tx_batch(struct vhost_net *net,
}
signal_used:
vhost_net_signal_used(nvq);
vhost_net_signal_used(nvq, in_order ? 1 : nvq->done_idx);
nvq->batched_xdp = 0;
}
@ -750,6 +761,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
bool busyloop_intr;
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
do {
busyloop_intr = false;
@ -786,11 +798,13 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
break;
}
/* We can't build XDP buff, go for single
* packet path but let's flush batched
* packets.
*/
vhost_tx_batch(net, nvq, sock, &msg);
if (nvq->batched_xdp) {
/* We can't build XDP buff, go for single
* packet path but let's flush batched
* packets.
*/
vhost_tx_batch(net, nvq, sock, &msg);
}
msg.msg_control = NULL;
} else {
if (tx_can_batch(vq, total_len))
@ -811,8 +825,12 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
pr_debug("Truncated TX packet: len %d != %zd\n",
err, len);
done:
vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->done_idx].len = 0;
if (in_order) {
vq->heads[0].id = cpu_to_vhost32(vq, head);
} else {
vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->done_idx].len = 0;
}
++nvq->done_idx;
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
@ -991,7 +1009,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
}
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
bool *busyloop_intr)
bool *busyloop_intr, unsigned int count)
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
@ -1001,7 +1019,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
if (!len && rvq->busyloop_timeout) {
/* Flush batched heads first */
vhost_net_signal_used(rnvq);
vhost_net_signal_used(rnvq, count);
/* Both tx vq and rx socket were polled here */
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
@ -1013,7 +1031,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
/* This is a multi-buffer version of vhost_get_desc, that works if
* vq has read descriptors only.
* @vq - the relevant virtqueue
* @nvq - the relevant vhost_net virtqueue
* @datalen - data length we'll be reading
* @iovcount - returned count of io vectors we fill
* @log - vhost log
@ -1021,14 +1039,17 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
* @quota - headcount quota, 1 for big buffer
* returns number of buffer heads allocated, negative on error
*/
static int get_rx_bufs(struct vhost_virtqueue *vq,
static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
struct vring_used_elem *heads,
u16 *nheads,
int datalen,
unsigned *iovcount,
struct vhost_log *log,
unsigned *log_num,
unsigned int quota)
{
struct vhost_virtqueue *vq = &nvq->vq;
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
unsigned int out, in;
int seg = 0;
int headcount = 0;
@ -1065,14 +1086,16 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
heads[headcount].id = cpu_to_vhost32(vq, d);
len = iov_length(vq->iov + seg, in);
heads[headcount].len = cpu_to_vhost32(vq, len);
datalen -= len;
if (!in_order) {
heads[headcount].id = cpu_to_vhost32(vq, d);
heads[headcount].len = cpu_to_vhost32(vq, len);
}
++headcount;
datalen -= len;
seg += in;
}
heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
@ -1082,6 +1105,15 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
r = UIO_MAXIOV + 1;
goto err;
}
if (!in_order)
heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
else {
heads[0].len = cpu_to_vhost32(vq, len + datalen);
heads[0].id = cpu_to_vhost32(vq, d);
nheads[0] = headcount;
}
return headcount;
err:
vhost_discard_vq_desc(vq, headcount);
@ -1094,6 +1126,8 @@ static void handle_rx(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_virtqueue *vq = &nvq->vq;
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
unsigned int count = 0;
unsigned in, log;
struct vhost_log *vq_log;
struct msghdr msg = {
@ -1141,12 +1175,13 @@ static void handle_rx(struct vhost_net *net)
do {
sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
&busyloop_intr);
&busyloop_intr, count);
if (!sock_len)
break;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
headcount = get_rx_bufs(nvq, vq->heads + count,
vq->nheads + count,
vhost_len, &in, vq_log, &log,
likely(mergeable) ? UIO_MAXIOV : 1);
/* On error, stop handling until the next kick. */
@ -1222,8 +1257,11 @@ static void handle_rx(struct vhost_net *net)
goto out;
}
nvq->done_idx += headcount;
if (nvq->done_idx > VHOST_NET_BATCH)
vhost_net_signal_used(nvq);
count += in_order ? 1 : headcount;
if (nvq->done_idx > VHOST_NET_BATCH) {
vhost_net_signal_used(nvq, count);
count = 0;
}
if (unlikely(vq_log))
vhost_log_write(vq, vq_log, log, vhost_len,
vq->iov, in);
@ -1235,7 +1273,7 @@ static void handle_rx(struct vhost_net *net)
else if (!sock_len)
vhost_net_enable_vq(net, vq);
out:
vhost_net_signal_used(nvq);
vhost_net_signal_used(nvq, count);
mutex_unlock(&vq->mutex);
}

@ -71,7 +71,7 @@ static int vhost_scsi_set_inline_sg_cnt(const char *buf,
if (ret)
return ret;
if (ret > VHOST_SCSI_PREALLOC_SGLS) {
if (cnt > VHOST_SCSI_PREALLOC_SGLS) {
pr_err("Max inline_sg_cnt is %u\n", VHOST_SCSI_PREALLOC_SGLS);
return -EINVAL;
}
@ -152,7 +152,7 @@ struct vhost_scsi_nexus {
struct vhost_scsi_tpg {
/* Vhost port target portal group tag for TCM */
u16 tport_tpgt;
/* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
/* Used to track number of TPG Port/Lun Links wrt to explicit I_T Nexus shutdown */
int tv_tpg_port_count;
/* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
int tv_tpg_vhost_count;
@ -311,12 +311,12 @@ static void vhost_scsi_init_inflight(struct vhost_scsi *vs,
mutex_lock(&vq->mutex);
/* store old infight */
/* store old inflight */
idx = vs->vqs[i].inflight_idx;
if (old_inflight)
old_inflight[i] = &vs->vqs[i].inflights[idx];
/* setup new infight */
/* setup new inflight */
vs->vqs[i].inflight_idx = idx ^ 1;
new_inflight = &vs->vqs[i].inflights[idx ^ 1];
kref_init(&new_inflight->kref);
@ -1226,10 +1226,8 @@ vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc,
/* validated at handler entry */
vs_tpg = vhost_vq_get_backend(vq);
tpg = READ_ONCE(vs_tpg[*vc->target]);
if (unlikely(!tpg)) {
vq_err(vq, "Target 0x%x does not exist\n", *vc->target);
if (unlikely(!tpg))
goto out;
}
}
if (tpgp)
@ -1249,7 +1247,7 @@ vhost_scsi_setup_resp_iovs(struct vhost_scsi_cmd *cmd, struct iovec *in_iovs,
if (!in_iovs_cnt)
return 0;
/*
* Initiator's normally just put the virtio_scsi_cmd_resp in the first
* Initiators normally just put the virtio_scsi_cmd_resp in the first
* iov, but just in case they wedged in some data with it we check for
* greater than or equal to the response struct.
*/
@ -1457,7 +1455,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
cmd = vhost_scsi_get_cmd(vq, tag);
if (IS_ERR(cmd)) {
ret = PTR_ERR(cmd);
vq_err(vq, "vhost_scsi_get_tag failed %dd\n", ret);
vq_err(vq, "vhost_scsi_get_tag failed %d\n", ret);
goto err;
}
cmd->tvc_vq = vq;
@ -2609,7 +2607,7 @@ static int vhost_scsi_make_nexus(struct vhost_scsi_tpg *tpg,
return -ENOMEM;
}
/*
* Since we are running in 'demo mode' this call with generate a
* Since we are running in 'demo mode' this call will generate a
* struct se_node_acl for the vhost_scsi struct se_portal_group with
* the SCSI Initiator port name of the passed configfs group 'name'.
*/
@ -2915,7 +2913,7 @@ static ssize_t
vhost_scsi_wwn_version_show(struct config_item *item, char *page)
{
return sysfs_emit(page, "TCM_VHOST fabric module %s on %s/%s"
"on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname,
" on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname,
utsname()->machine);
}
@ -2983,13 +2981,13 @@ out_vhost_scsi_deregister:
vhost_scsi_deregister();
out:
return ret;
};
}
static void vhost_scsi_exit(void)
{
target_unregister_template(&vhost_scsi_ops);
vhost_scsi_deregister();
};
}
MODULE_DESCRIPTION("VHOST_SCSI series fabric driver");
MODULE_ALIAS("tcm_vhost");

@ -22,6 +22,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <linux/sched/mm.h>
@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
"Maximum number of iotlb entries. (default: 2048)");
static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
module_param(fork_from_owner_default, bool, 0444);
MODULE_PARM_DESC(fork_from_owner_default,
"Set task mode as the default(default: Y)");
#endif
enum {
VHOST_MEMORY_F_LOG = 0x1,
@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker,
* test_and_set_bit() implies a memory barrier.
*/
llist_add(&work->node, &worker->work_list);
vhost_task_wake(worker->vtsk);
worker->ops->wakeup(worker);
}
}
@ -364,6 +372,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->avail = NULL;
vq->used = NULL;
vq->last_avail_idx = 0;
vq->next_avail_head = 0;
vq->avail_idx = 0;
vq->last_used_idx = 0;
vq->signalled_used = 0;
@ -388,6 +397,44 @@ static void vhost_vq_reset(struct vhost_dev *dev,
__vhost_vq_meta_reset(vq);
}
static int vhost_run_work_kthread_list(void *data)
{
struct vhost_worker *worker = data;
struct vhost_work *work, *work_next;
struct vhost_dev *dev = worker->dev;
struct llist_node *node;
kthread_use_mm(dev->mm);
for (;;) {
/* mb paired w/ kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
break;
}
node = llist_del_all(&worker->work_list);
if (!node)
schedule();
node = llist_reverse_order(node);
/* make sure flag is seen after deletion */
smp_wmb();
llist_for_each_entry_safe(work, work_next, node, node) {
clear_bit(VHOST_WORK_QUEUED, &work->flags);
__set_current_state(TASK_RUNNING);
kcov_remote_start_common(worker->kcov_handle);
work->fn(work);
kcov_remote_stop();
cond_resched();
}
}
kthread_unuse_mm(dev->mm);
return 0;
}
static bool vhost_run_work_list(void *data)
{
struct vhost_worker *worker = data;
@ -455,6 +502,8 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
vq->log = NULL;
kfree(vq->heads);
vq->heads = NULL;
kfree(vq->nheads);
vq->nheads = NULL;
}
/* Helper to allocate iovec buffers for all vqs. */
@ -472,7 +521,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
GFP_KERNEL);
vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
GFP_KERNEL);
if (!vq->indirect || !vq->log || !vq->heads)
vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads),
GFP_KERNEL);
if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads)
goto err_nomem;
}
return 0;
@ -552,6 +603,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->byte_weight = byte_weight;
dev->use_worker = use_worker;
dev->msg_handler = msg_handler;
dev->fork_owner = fork_from_owner_default;
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
@ -581,6 +633,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
struct vhost_attach_cgroups_struct {
struct vhost_work work;
struct task_struct *owner;
int ret;
};
static void vhost_attach_cgroups_work(struct vhost_work *work)
{
struct vhost_attach_cgroups_struct *s;
s = container_of(work, struct vhost_attach_cgroups_struct, work);
s->ret = cgroup_attach_task_all(s->owner, current);
}
static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
{
struct vhost_attach_cgroups_struct attach;
int saved_cnt;
attach.owner = current;
vhost_work_init(&attach.work, vhost_attach_cgroups_work);
vhost_worker_queue(worker, &attach.work);
mutex_lock(&worker->mutex);
/*
* Bypass attachment_cnt check in __vhost_worker_flush:
* Temporarily change it to INT_MAX to bypass the check
*/
saved_cnt = worker->attachment_cnt;
worker->attachment_cnt = INT_MAX;
__vhost_worker_flush(worker);
worker->attachment_cnt = saved_cnt;
mutex_unlock(&worker->mutex);
return attach.ret;
}
/* Caller should have device mutex */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
@ -594,10 +686,10 @@ static void vhost_attach_mm(struct vhost_dev *dev)
if (dev->use_worker) {
dev->mm = get_task_mm(current);
} else {
/* vDPA device does not use worker thead, so there's
* no need to hold the address space for mm. This help
/* vDPA device does not use worker thread, so there's
* no need to hold the address space for mm. This helps
* to avoid deadlock in the case of mmap() which may
* held the refcnt of the file and depends on release
* hold the refcnt of the file and depends on release
* method to remove vma.
*/
dev->mm = current->mm;
@ -626,7 +718,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
WARN_ON(!llist_empty(&worker->work_list));
xa_erase(&dev->worker_xa, worker->id);
vhost_task_stop(worker->vtsk);
worker->ops->stop(worker);
kfree(worker);
}
@ -649,42 +741,115 @@ static void vhost_workers_free(struct vhost_dev *dev)
xa_destroy(&dev->worker_xa);
}
static void vhost_task_wakeup(struct vhost_worker *worker)
{
return vhost_task_wake(worker->vtsk);
}
static void vhost_kthread_wakeup(struct vhost_worker *worker)
{
wake_up_process(worker->kthread_task);
}
static void vhost_task_do_stop(struct vhost_worker *worker)
{
return vhost_task_stop(worker->vtsk);
}
static void vhost_kthread_do_stop(struct vhost_worker *worker)
{
kthread_stop(worker->kthread_task);
}
static int vhost_task_worker_create(struct vhost_worker *worker,
struct vhost_dev *dev, const char *name)
{
struct vhost_task *vtsk;
u32 id;
int ret;
vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
worker, name);
if (IS_ERR(vtsk))
return PTR_ERR(vtsk);
worker->vtsk = vtsk;
vhost_task_start(vtsk);
ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
if (ret < 0) {
vhost_task_do_stop(worker);
return ret;
}
worker->id = id;
return 0;
}
static int vhost_kthread_worker_create(struct vhost_worker *worker,
struct vhost_dev *dev, const char *name)
{
struct task_struct *task;
u32 id;
int ret;
task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
if (IS_ERR(task))
return PTR_ERR(task);
worker->kthread_task = task;
wake_up_process(task);
ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
if (ret < 0)
goto stop_worker;
ret = vhost_attach_task_to_cgroups(worker);
if (ret)
goto stop_worker;
worker->id = id;
return 0;
stop_worker:
vhost_kthread_do_stop(worker);
return ret;
}
static const struct vhost_worker_ops kthread_ops = {
.create = vhost_kthread_worker_create,
.stop = vhost_kthread_do_stop,
.wakeup = vhost_kthread_wakeup,
};
static const struct vhost_worker_ops vhost_task_ops = {
.create = vhost_task_worker_create,
.stop = vhost_task_do_stop,
.wakeup = vhost_task_wakeup,
};
static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
{
struct vhost_worker *worker;
struct vhost_task *vtsk;
char name[TASK_COMM_LEN];
int ret;
u32 id;
const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
&kthread_ops;
worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
if (!worker)
return NULL;
worker->dev = dev;
worker->ops = ops;
snprintf(name, sizeof(name), "vhost-%d", current->pid);
vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
worker, name);
if (IS_ERR(vtsk))
goto free_worker;
mutex_init(&worker->mutex);
init_llist_head(&worker->work_list);
worker->kcov_handle = kcov_common_handle();
worker->vtsk = vtsk;
vhost_task_start(vtsk);
ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
ret = ops->create(worker, dev, name);
if (ret < 0)
goto stop_worker;
worker->id = id;
goto free_worker;
return worker;
stop_worker:
vhost_task_stop(vtsk);
free_worker:
kfree(worker);
return NULL;
@ -731,7 +896,7 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
* We don't want to call synchronize_rcu for every vq during setup
* because it will slow down VM startup. If we haven't done
* VHOST_SET_VRING_KICK and not done the driver specific
* SET_ENDPOINT/RUNNUNG then we can skip the sync since there will
* SET_ENDPOINT/RUNNING then we can skip the sync since there will
* not be any works queued for scsi and net.
*/
mutex_lock(&vq->mutex);
@ -865,6 +1030,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
switch (ioctl) {
/* dev worker ioctls */
case VHOST_NEW_WORKER:
/*
* vhost_tasks will account for worker threads under the parent's
* NPROC value but kthreads do not. To avoid userspace overflowing
* the system with worker threads fork_owner must be true.
*/
if (!dev->fork_owner)
return -EFAULT;
ret = vhost_new_worker(dev, &state);
if (!ret && copy_to_user(argp, &state, sizeof(state)))
ret = -EFAULT;
@ -982,6 +1155,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
vhost_dev_cleanup(dev);
dev->fork_owner = fork_from_owner_default;
dev->umem = umem;
/* We don't need VQ locks below since vhost_dev_cleanup makes sure
* VQs aren't running.
@ -1990,14 +2164,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
break;
}
if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
vq->last_avail_idx = s.num & 0xffff;
vq->next_avail_head = vq->last_avail_idx =
s.num & 0xffff;
vq->last_used_idx = (s.num >> 16) & 0xffff;
} else {
if (s.num > 0xffff) {
r = -EINVAL;
break;
}
vq->last_avail_idx = s.num;
vq->next_avail_head = vq->last_avail_idx = s.num;
}
/* Forget the cached index value. */
vq->avail_idx = vq->last_avail_idx;
@ -2135,6 +2310,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
goto done;
}
#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
/* Only allow modification before owner is set */
if (vhost_dev_has_owner(d)) {
r = -EBUSY;
goto done;
}
u8 fork_owner_val;
if (get_user(fork_owner_val, (u8 __user *)argp)) {
r = -EFAULT;
goto done;
}
if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
r = -EINVAL;
goto done;
}
d->fork_owner = !!fork_owner_val;
r = 0;
goto done;
}
if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
u8 fork_owner_val = d->fork_owner;
if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
r = -EINVAL;
goto done;
}
if (put_user(fork_owner_val, (u8 __user *)argp)) {
r = -EFAULT;
goto done;
}
r = 0;
goto done;
}
#endif
/* You must be the owner to do anything else */
r = vhost_dev_check_owner(d);
if (r)
@ -2590,11 +2804,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num)
{
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
struct vring_desc desc;
unsigned int i, head, found = 0;
u16 last_avail_idx = vq->last_avail_idx;
__virtio16 ring_head;
int ret, access;
int ret, access, c = 0;
if (vq->avail_idx == vq->last_avail_idx) {
ret = vhost_get_avail_idx(vq);
@ -2605,17 +2820,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
return vq->num;
}
/* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */
if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
vq_err(vq, "Failed to read head: idx %d address %p\n",
last_avail_idx,
&vq->avail->ring[last_avail_idx % vq->num]);
return -EFAULT;
if (in_order)
head = vq->next_avail_head & (vq->num - 1);
else {
/* Grab the next descriptor number they're
* advertising, and increment the index we've seen. */
if (unlikely(vhost_get_avail_head(vq, &ring_head,
last_avail_idx))) {
vq_err(vq, "Failed to read head: idx %d address %p\n",
last_avail_idx,
&vq->avail->ring[last_avail_idx % vq->num]);
return -EFAULT;
}
head = vhost16_to_cpu(vq, ring_head);
}
head = vhost16_to_cpu(vq, ring_head);
/* If their number is silly, that's an error. */
if (unlikely(head >= vq->num)) {
vq_err(vq, "Guest says index %u > %u is available",
@ -2658,6 +2877,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
"in indirect descriptor at idx %d\n", i);
return ret;
}
++c;
continue;
}
@ -2693,10 +2913,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
*out_num += ret;
}
++c;
} while ((i = next_desc(vq, &desc)) != -1);
/* On success, increment avail index. */
vq->last_avail_idx++;
vq->next_avail_head += c;
/* Assume notifications from guest are disabled at this point,
* if they aren't we would need to update avail_event index. */
@ -2720,8 +2942,9 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
cpu_to_vhost32(vq, head),
cpu_to_vhost32(vq, len)
};
u16 nheads = 1;
return vhost_add_used_n(vq, &heads, 1);
return vhost_add_used_n(vq, &heads, &nheads, 1);
}
EXPORT_SYMBOL_GPL(vhost_add_used);
@ -2757,10 +2980,9 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
return 0;
}
/* After we've used one of their buffers, we tell them about it. We'll then
* want to notify the guest, using eventfd. */
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
unsigned count)
static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
unsigned count)
{
int start, n, r;
@ -2773,7 +2995,72 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
heads += n;
count -= n;
}
r = __vhost_add_used_n(vq, heads, count);
return __vhost_add_used_n(vq, heads, count);
}
static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
const u16 *nheads,
unsigned count)
{
vring_used_elem_t __user *used;
u16 old, new = vq->last_used_idx;
int start, i;
if (!nheads)
return -EINVAL;
start = vq->last_used_idx & (vq->num - 1);
used = vq->used->ring + start;
for (i = 0; i < count; i++) {
if (vhost_put_used(vq, &heads[i], start, 1)) {
vq_err(vq, "Failed to write used");
return -EFAULT;
}
start += nheads[i];
new += nheads[i];
if (start >= vq->num)
start -= vq->num;
}
if (unlikely(vq->log_used)) {
/* Make sure data is seen before log. */
smp_wmb();
/* Log used ring entry write. */
log_used(vq, ((void __user *)used - (void __user *)vq->used),
(vq->num - start) * sizeof *used);
if (start + count > vq->num)
log_used(vq, 0,
(start + count - vq->num) * sizeof *used);
}
old = vq->last_used_idx;
vq->last_used_idx = new;
/* If the driver never bothers to signal in a very long while,
* used index might wrap around. If that happens, invalidate
* signalled_used index we stored. TODO: make sure driver
* signals at least once in 2^16 and remove this. */
if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
vq->signalled_used_valid = false;
return 0;
}
/* After we've used one of their buffers, we tell them about it. We'll then
* want to notify the guest, using eventfd. */
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
u16 *nheads, unsigned count)
{
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
int r;
if (!in_order || !nheads)
r = vhost_add_used_n_ooo(vq, heads, count);
else
r = vhost_add_used_n_in_order(vq, heads, nheads, count);
if (r < 0)
return r;
/* Make sure buffer is written before we update index. */
smp_wmb();
@ -2853,14 +3140,16 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
/* multi-buffer version of vhost_add_used_and_signal */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
struct vring_used_elem *heads, unsigned count)
struct vring_used_elem *heads,
u16 *nheads,
unsigned count)
{
vhost_add_used_n(vq, heads, count);
vhost_add_used_n(vq, heads, nheads, count);
vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
/* return true if we're sure that avaiable ring is empty */
/* return true if we're sure that available ring is empty */
bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
int r;

@ -26,7 +26,18 @@ struct vhost_work {
unsigned long flags;
};
struct vhost_worker;
struct vhost_dev;
struct vhost_worker_ops {
int (*create)(struct vhost_worker *worker, struct vhost_dev *dev,
const char *name);
void (*stop)(struct vhost_worker *worker);
void (*wakeup)(struct vhost_worker *worker);
};
struct vhost_worker {
struct task_struct *kthread_task;
struct vhost_task *vtsk;
struct vhost_dev *dev;
/* Used to serialize device wide flushing with worker swapping. */
@ -36,6 +47,7 @@ struct vhost_worker {
u32 id;
int attachment_cnt;
bool killed;
const struct vhost_worker_ops *ops;
};
/* Poll a file (eventfd or socket) */
@ -103,6 +115,8 @@ struct vhost_virtqueue {
* Values are limited to 0x7fff, and the high bit is used as
* a wrap counter when using VIRTIO_F_RING_PACKED. */
u16 last_avail_idx;
/* Next avail ring head when VIRTIO_F_IN_ORDER is negoitated */
u16 next_avail_head;
/* Caches available index value from user. */
u16 avail_idx;
@ -129,6 +143,7 @@ struct vhost_virtqueue {
struct iovec iotlb_iov[64];
struct iovec *indirect;
struct vring_used_elem *heads;
u16 *nheads;
/* Protected by virtqueue mutex. */
struct vhost_iotlb *umem;
struct vhost_iotlb *iotlb;
@ -176,6 +191,16 @@ struct vhost_dev {
int byte_weight;
struct xarray worker_xa;
bool use_worker;
/*
* If fork_owner is true we use vhost_tasks to create
* the worker so all settings/limits like cgroups, NPROC,
* scheduler, etc are inherited from the owner. If false,
* we use kthreads and only attach to the same cgroups
* as the owner for compat with older kernels.
* here we use true as default value.
* The default value is set by fork_from_owner_default
*/
bool fork_owner;
int (*msg_handler)(struct vhost_dev *dev, u32 asid,
struct vhost_iotlb_msg *msg);
};
@ -213,11 +238,12 @@ bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
unsigned count);
u16 *nheads, unsigned count);
void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
unsigned int id, int len);
void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
struct vring_used_elem *heads, unsigned count);
struct vring_used_elem *heads, u16 *nheads,
unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);

@ -779,22 +779,6 @@ ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
}
EXPORT_SYMBOL(vringh_iov_push_user);
/**
* vringh_abandon_user - we've decided not to handle the descriptor(s).
* @vrh: the vring.
* @num: the number of descriptors to put back (ie. num
* vringh_get_user() to undo).
*
* The next vringh_get_user() will return the old descriptor(s) again.
*/
void vringh_abandon_user(struct vringh *vrh, unsigned int num)
{
/* We only update vring_avail_event(vr) when we want to be notified,
* so we haven't changed that yet. */
vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_user);
/**
* vringh_complete_user - we've finished with descriptor, publish it.
* @vrh: the vring.
@ -900,20 +884,6 @@ static inline int putused_kern(const struct vringh *vrh,
return 0;
}
static inline int xfer_kern(const struct vringh *vrh, void *src,
void *dst, size_t len)
{
memcpy(dst, src, len);
return 0;
}
static inline int kern_xfer(const struct vringh *vrh, void *dst,
void *src, size_t len)
{
memcpy(dst, src, len);
return 0;
}
/**
* vringh_init_kern - initialize a vringh for a kernelspace vring.
* @vrh: the vringh to initialize.
@ -998,51 +968,6 @@ int vringh_getdesc_kern(struct vringh *vrh,
}
EXPORT_SYMBOL(vringh_getdesc_kern);
/**
* vringh_iov_pull_kern - copy bytes from vring_iov.
* @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
* @dst: the place to copy.
* @len: the maximum length to copy.
*
* Returns the bytes copied <= len or a negative errno.
*/
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
{
return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_pull_kern);
/**
* vringh_iov_push_kern - copy bytes into vring_iov.
* @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
* @src: the place to copy from.
* @len: the maximum length to copy.
*
* Returns the bytes copied <= len or a negative errno.
*/
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
const void *src, size_t len)
{
return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer);
}
EXPORT_SYMBOL(vringh_iov_push_kern);
/**
* vringh_abandon_kern - we've decided not to handle the descriptor(s).
* @vrh: the vring.
* @num: the number of descriptors to put back (ie. num
* vringh_get_kern() to undo).
*
* The next vringh_get_kern() will return the old descriptor(s) again.
*/
void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
{
/* We only update vring_avail_event(vr) when we want to be notified,
* so we haven't changed that yet. */
vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_kern);
/**
* vringh_complete_kern - we've finished with descriptor, publish it.
* @vrh: the vring.
@ -1534,23 +1459,6 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
}
EXPORT_SYMBOL(vringh_iov_push_iotlb);
/**
* vringh_abandon_iotlb - we've decided not to handle the descriptor(s).
* @vrh: the vring.
* @num: the number of descriptors to put back (ie. num
* vringh_get_iotlb() to undo).
*
* The next vringh_get_iotlb() will return the old descriptor(s) again.
*/
void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num)
{
/* We only update vring_avail_event(vr) when we want to be notified,
* so we haven't changed that yet.
*/
vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_iotlb);
/**
* vringh_complete_iotlb - we've finished with descriptor, publish it.
* @vrh: the vring.
@ -1571,32 +1479,6 @@ int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len)
}
EXPORT_SYMBOL(vringh_complete_iotlb);
/**
* vringh_notify_enable_iotlb - we want to know if something changes.
* @vrh: the vring.
*
* This always enables notifications, but returns false if there are
* now more buffers available in the vring.
*/
bool vringh_notify_enable_iotlb(struct vringh *vrh)
{
return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb);
}
EXPORT_SYMBOL(vringh_notify_enable_iotlb);
/**
* vringh_notify_disable_iotlb - don't tell us if something changes.
* @vrh: the vring.
*
* This is our normal running state: we disable and then only enable when
* we're going to sleep.
*/
void vringh_notify_disable_iotlb(struct vringh *vrh)
{
__vringh_notify_disable(vrh, putu16_iotlb);
}
EXPORT_SYMBOL(vringh_notify_disable_iotlb);
/**
* vringh_need_notify_iotlb - must we tell the other side about used buffers?
* @vrh: the vring we've called vringh_complete_iotlb() on.

@ -344,6 +344,10 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
len = iov_length(vq->iov, out);
if (len < VIRTIO_VSOCK_SKB_HEADROOM ||
len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM)
return NULL;
/* len contains both payload and hdr */
skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
if (!skb)
@ -367,18 +371,15 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
return skb;
/* The pkt is too big or the length in the header is invalid */
if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
payload_len + sizeof(*hdr) > len) {
if (payload_len + sizeof(*hdr) > len) {
kfree_skb(skb);
return NULL;
}
virtio_vsock_skb_rx_put(skb);
virtio_vsock_skb_put(skb, payload_len);
nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
if (nbytes != payload_len) {
vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
payload_len, nbytes);
if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) {
vq_err(vq, "Failed to copy %zu byte payload\n", payload_len);
kfree_skb(skb);
return NULL;
}

@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(virtio_config_changed);
/**
* virtio_config_driver_disable - disable config change reporting by drivers
* @dev: the device to reset
* @dev: the device to disable
*
* This is only allowed to be called by a driver and disabling can't
* be nested.
@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(virtio_config_driver_disable);
/**
* virtio_config_driver_enable - enable config change reporting by drivers
* @dev: the device to reset
* @dev: the device to enable
*
* This is only allowed to be called by a driver and enabling can't
* be nested.
@ -512,7 +512,7 @@ out:
* On error, the caller must call put_device on &@dev->dev (and not kfree),
* as another code path may have obtained a reference to @dev.
*
* Returns: 0 on suceess, -error on failure
* Returns: 0 on success, -error on failure
*/
int register_virtio_device(struct virtio_device *dev)
{
@ -536,6 +536,7 @@ int register_virtio_device(struct virtio_device *dev)
goto out_ida_remove;
spin_lock_init(&dev->config_lock);
dev->config_driver_disabled = false;
dev->config_core_enabled = false;
dev->config_change_pending = false;

@ -36,6 +36,8 @@ EXPORT_SYMBOL(virtio_dma_buf_export);
/**
* virtio_dma_buf_attach - mandatory attach callback for virtio dma-bufs
* @dma_buf: [in] buffer to attach
* @attach: [in] attachment structure
*/
int virtio_dma_buf_attach(struct dma_buf *dma_buf,
struct dma_buf_attachment *attach)

@ -65,7 +65,6 @@
#include <linux/platform_device.h>
#include <linux/pm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <uapi/linux/virtio_mmio.h>
@ -88,22 +87,8 @@ struct virtio_mmio_device {
void __iomem *base;
unsigned long version;
/* a list of queues so we can dispatch IRQs */
spinlock_t lock;
struct list_head virtqueues;
};
struct virtio_mmio_vq_info {
/* the actual virtqueue */
struct virtqueue *vq;
/* the list node for the virtqueues list */
struct list_head node;
};
/* Configuration interface */
static u64 vm_get_features(struct virtio_device *vdev)
@ -300,9 +285,8 @@ static bool vm_notify_with_data(struct virtqueue *vq)
static irqreturn_t vm_interrupt(int irq, void *opaque)
{
struct virtio_mmio_device *vm_dev = opaque;
struct virtio_mmio_vq_info *info;
struct virtqueue *vq;
unsigned long status;
unsigned long flags;
irqreturn_t ret = IRQ_NONE;
/* Read and acknowledge interrupts */
@ -315,10 +299,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
}
if (likely(status & VIRTIO_MMIO_INT_VRING)) {
spin_lock_irqsave(&vm_dev->lock, flags);
list_for_each_entry(info, &vm_dev->virtqueues, node)
ret |= vring_interrupt(irq, info->vq);
spin_unlock_irqrestore(&vm_dev->lock, flags);
virtio_device_for_each_vq(&vm_dev->vdev, vq)
ret |= vring_interrupt(irq, vq);
}
return ret;
@ -329,14 +311,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
static void vm_del_vq(struct virtqueue *vq)
{
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
struct virtio_mmio_vq_info *info = vq->priv;
unsigned long flags;
unsigned int index = vq->index;
spin_lock_irqsave(&vm_dev->lock, flags);
list_del(&info->node);
spin_unlock_irqrestore(&vm_dev->lock, flags);
/* Select and deactivate the queue */
writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
if (vm_dev->version == 1) {
@ -347,8 +323,6 @@ static void vm_del_vq(struct virtqueue *vq)
}
vring_del_virtqueue(vq);
kfree(info);
}
static void vm_del_vqs(struct virtio_device *vdev)
@ -375,9 +349,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
{
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
bool (*notify)(struct virtqueue *vq);
struct virtio_mmio_vq_info *info;
struct virtqueue *vq;
unsigned long flags;
unsigned int num;
int err;
@ -399,13 +371,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
goto error_available;
}
/* Allocate and fill out our active queue description */
info = kmalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
err = -ENOMEM;
goto error_kmalloc;
}
num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
if (num == 0) {
err = -ENOENT;
@ -463,13 +428,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
}
vq->priv = info;
info->vq = vq;
spin_lock_irqsave(&vm_dev->lock, flags);
list_add(&info->node, &vm_dev->virtqueues);
spin_unlock_irqrestore(&vm_dev->lock, flags);
return vq;
error_bad_pfn:
@ -481,8 +439,6 @@ error_new_virtqueue:
writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
}
kfree(info);
error_kmalloc:
error_available:
return ERR_PTR(err);
}
@ -627,8 +583,6 @@ static int virtio_mmio_probe(struct platform_device *pdev)
vm_dev->vdev.dev.release = virtio_mmio_release_dev;
vm_dev->vdev.config = &virtio_mmio_config_ops;
vm_dev->pdev = pdev;
INIT_LIST_HEAD(&vm_dev->virtqueues);
spin_lock_init(&vm_dev->lock);
vm_dev->base = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(vm_dev->base)) {

@ -2296,6 +2296,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
*
* NB: ENOSPC is a special code that is only returned on an attempt to add a
* buffer to a full VQ. It indicates that some buffers are outstanding and that
* the operation can be retried after some buffers have been used.
*/
int virtqueue_add_sgs(struct virtqueue *_vq,
struct scatterlist *sgs[],
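
A minimal sketch of the retry pattern described by the ENOSPC note just added above, for illustration only (my_vsock_dev, my_stop_queue and the GFP choice are hypothetical, not part of this series): when virtqueue_add_sgs() returns -ENOSPC the ring is full, so a driver kicks the device and backs off until used buffers come back.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Hypothetical driver state; only the virtqueue pointer matters here. */
struct my_vsock_dev {
	struct virtqueue *vq;
};

/* Illustrative backpressure hook: stop submitting until the used-buffer
 * callback re-enables the queue. */
static void my_stop_queue(struct my_vsock_dev *d)
{
}

static int my_queue_buf(struct my_vsock_dev *d, struct scatterlist *sg,
			void *token)
{
	struct scatterlist *sgs[] = { sg };
	int err;

	err = virtqueue_add_sgs(d->vq, sgs, 1, 0, token, GFP_ATOMIC);
	if (err == -ENOSPC) {
		/* Ring full, buffers outstanding: kick the device and
		 * retry after some of them have been used. */
		virtqueue_kick(d->vq);
		my_stop_queue(d);
		return err;
	}
	if (err)
		return err;

	virtqueue_kick(d->vq);
	return 0;
}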

@ -28,19 +28,6 @@ struct virtio_vdpa_device {
struct virtio_device vdev;
struct vdpa_device *vdpa;
u64 features;
/* The lock to protect virtqueue list */
spinlock_t lock;
/* List of virtio_vdpa_vq_info */
struct list_head virtqueues;
};
struct virtio_vdpa_vq_info {
/* the actual virtqueue */
struct virtqueue *vq;
/* the list node for the virtqueues list */
struct list_head node;
};
static inline struct virtio_vdpa_device *
@ -135,9 +122,9 @@ static irqreturn_t virtio_vdpa_config_cb(void *private)
static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
{
struct virtio_vdpa_vq_info *info = private;
struct virtqueue *vq = private;
return vring_interrupt(0, info->vq);
return vring_interrupt(0, vq);
}
static struct virtqueue *
@ -145,18 +132,15 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
void (*callback)(struct virtqueue *vq),
const char *name, bool ctx)
{
struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
struct vdpa_device *vdpa = vd_get_vdpa(vdev);
struct device *dma_dev;
const struct vdpa_config_ops *ops = vdpa->config;
struct virtio_vdpa_vq_info *info;
bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify;
struct vdpa_callback cb;
struct virtqueue *vq;
u64 desc_addr, driver_addr, device_addr;
/* Assume split virtqueue, switch to packed if necessary */
struct vdpa_vq_state state = {0};
unsigned long flags;
u32 align, max_num, min_num = 1;
bool may_reduce_num = true;
int err;
@ -179,10 +163,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
if (ops->get_vq_ready(vdpa, index))
return ERR_PTR(-ENOENT);
/* Allocate and fill out our active queue description */
info = kmalloc(sizeof(*info), GFP_KERNEL);
if (!info)
return ERR_PTR(-ENOMEM);
if (ops->get_vq_size)
max_num = ops->get_vq_size(vdpa, index);
else
@ -217,7 +197,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
/* Setup virtqueue callback */
cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
cb.private = info;
cb.private = vq;
cb.trigger = NULL;
ops->set_vq_cb(vdpa, index, &cb);
ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
@ -248,13 +228,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
ops->set_vq_ready(vdpa, index, 1);
vq->priv = info;
info->vq = vq;
spin_lock_irqsave(&vd_dev->lock, flags);
list_add(&info->node, &vd_dev->virtqueues);
spin_unlock_irqrestore(&vd_dev->lock, flags);
return vq;
err_vq:
@ -263,7 +236,6 @@ error_new_virtqueue:
ops->set_vq_ready(vdpa, index, 0);
/* VDPA driver should make sure vq is stopeed here */
WARN_ON(ops->get_vq_ready(vdpa, index));
kfree(info);
return ERR_PTR(err);
}
@ -272,20 +244,12 @@ static void virtio_vdpa_del_vq(struct virtqueue *vq)
struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
struct vdpa_device *vdpa = vd_dev->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
struct virtio_vdpa_vq_info *info = vq->priv;
unsigned int index = vq->index;
unsigned long flags;
spin_lock_irqsave(&vd_dev->lock, flags);
list_del(&info->node);
spin_unlock_irqrestore(&vd_dev->lock, flags);
/* Select and deactivate the queue (best effort) */
ops->set_vq_ready(vdpa, index, 0);
vring_del_virtqueue(vq);
kfree(info);
}
static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
@ -502,8 +466,6 @@ static int virtio_vdpa_probe(struct vdpa_device *vdpa)
vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
vd_dev->vdev.config = &virtio_vdpa_config_ops;
vd_dev->vdpa = vdpa;
INIT_LIST_HEAD(&vd_dev->virtqueues);
spin_lock_init(&vd_dev->lock);
vd_dev->vdev.id.device = ops->get_device_id(vdpa);
if (vd_dev->vdev.id.device == 0)

@ -199,7 +199,7 @@ int virtio_device_reset_done(struct virtio_device *dev);
size_t virtio_max_dma_size(const struct virtio_device *vdev);
#define virtio_device_for_each_vq(vdev, vq) \
list_for_each_entry(vq, &vdev->vqs, list)
list_for_each_entry(vq, &(vdev)->vqs, list)
/**
* struct virtio_driver - operations for a virtio I/O driver

@ -47,31 +47,50 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb)
VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false;
}
static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb)
static inline void virtio_vsock_skb_put(struct sk_buff *skb, u32 len)
{
u32 len;
DEBUG_NET_WARN_ON_ONCE(skb->len);
len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
if (len > 0)
if (skb_is_nonlinear(skb))
skb->len = len;
else
skb_put(skb, len);
}
static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
static inline struct sk_buff *
__virtio_vsock_alloc_skb_with_frags(unsigned int header_len,
unsigned int data_len,
gfp_t mask)
{
struct sk_buff *skb;
int err;
if (size < VIRTIO_VSOCK_SKB_HEADROOM)
return NULL;
skb = alloc_skb(size, mask);
skb = alloc_skb_with_frags(header_len, data_len,
PAGE_ALLOC_COSTLY_ORDER, &err, mask);
if (!skb)
return NULL;
skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM);
skb->data_len = data_len;
return skb;
}
static inline struct sk_buff *
virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask)
{
return __virtio_vsock_alloc_skb_with_frags(size, 0, mask);
}
static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
{
if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
return virtio_vsock_alloc_linear_skb(size, mask);
size -= VIRTIO_VSOCK_SKB_HEADROOM;
return __virtio_vsock_alloc_skb_with_frags(VIRTIO_VSOCK_SKB_HEADROOM,
size, mask);
}
static inline void
virtio_vsock_skb_queue_head(struct sk_buff_head *list, struct sk_buff *skb)
{
@ -111,7 +130,12 @@ static inline size_t virtio_vsock_skb_len(struct sk_buff *skb)
return (size_t)(skb_end_pointer(skb) - skb->head);
}
#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4)
/* Dimension the RX SKB so that the entire thing fits exactly into
* a single 4KiB page. This avoids wasting memory due to alloc_skb()
* rounding up to the next page order and also means that we
* don't leave higher-order pages sitting around in the RX queue.
*/
#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE SKB_WITH_OVERHEAD(1024 * 4)
#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL
#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64)
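
A short worked note on the sizing rationale above, as a sketch only (the helper below is illustrative and relies on the stock SKB_WITH_OVERHEAD()/SKB_DATA_ALIGN() macros from <linux/skbuff.h>; nothing in it is added by this series): alloc_skb(len) allocates roughly SKB_DATA_ALIGN(len) plus an aligned struct skb_shared_info from kmalloc, which rounds up to a power-of-two slab, so a raw 4096-byte request spills into an 8 KiB slab while SKB_WITH_OVERHEAD(4096) lands exactly on one 4 KiB page.

#include <linux/build_bug.h>
#include <linux/skbuff.h>

/* Illustrative build-time restatement of the page-fit argument:
 * SKB_WITH_OVERHEAD(X) is X minus the aligned skb_shared_info, so the
 * data area plus the shared info sums back to exactly one 4 KiB page. */
static inline void vsock_rx_buf_fits_one_page(void)
{
	BUILD_BUG_ON(SKB_WITH_OVERHEAD(4096) +
		     SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) != 4096);
}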

@ -175,9 +175,6 @@ int vringh_complete_multi_user(struct vringh *vrh,
const struct vring_used_elem used[],
unsigned num_used);
/* Pretend we've never seen descriptor (for easy error handling). */
void vringh_abandon_user(struct vringh *vrh, unsigned int num);
/* Do we need to fire the eventfd to notify the other side? */
int vringh_need_notify_user(struct vringh *vrh);
@ -235,10 +232,6 @@ int vringh_getdesc_kern(struct vringh *vrh,
u16 *head,
gfp_t gfp);
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
const void *src, size_t len);
void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
bool vringh_notify_enable_kern(struct vringh *vrh);
@ -319,13 +312,8 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
struct vringh_kiov *wiov,
const void *src, size_t len);
void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num);
int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len);
bool vringh_notify_enable_iotlb(struct vringh *vrh);
void vringh_notify_disable_iotlb(struct vringh *vrh);
int vringh_need_notify_iotlb(struct vringh *vrh);
#endif /* CONFIG_VHOST_IOTLB */

@ -242,4 +242,32 @@
#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \
struct vhost_features_array)
/* fork_owner values for vhost */
#define VHOST_FORK_OWNER_KTHREAD 0
#define VHOST_FORK_OWNER_TASK 1
/**
* VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device,
* This ioctl must called before VHOST_SET_OWNER.
* Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
*
* @param fork_owner: An 8-bit value that determines the vhost thread mode
*
* When fork_owner is set to VHOST_FORK_OWNER_TASK(default value):
* - Vhost will create vhost worker as tasks forked from the owner,
* inheriting all of the owner's attributes.
*
* When fork_owner is set to VHOST_FORK_OWNER_KTHREAD:
* - Vhost will create vhost workers as kernel threads.
*/
#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8)
/**
* VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device.
* Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
*
* @return: An 8-bit value indicating the current thread mode.
*/
#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8)
#endif
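
A hedged userspace sketch of the new ioctls documented above (the /dev/vhost-net path and error handling are illustrative; it assumes headers carrying these definitions and a kernel built with CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y): request kthread-backed workers before VHOST_SET_OWNER, then read the mode back.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>
#include <linux/vhost.h>

int main(void)
{
	__u8 mode = VHOST_FORK_OWNER_KTHREAD;
	int fd = open("/dev/vhost-net", O_RDWR);	/* illustrative device */

	if (fd < 0)
		return 1;

	/* Must be issued before VHOST_SET_OWNER; afterwards it fails with EBUSY. */
	if (ioctl(fd, VHOST_SET_FORK_FROM_OWNER, &mode))
		perror("VHOST_SET_FORK_FROM_OWNER");

	if (ioctl(fd, VHOST_GET_FORK_FROM_OWNER, &mode) == 0)
		printf("fork_owner: %u\n", mode);

	if (ioctl(fd, VHOST_SET_OWNER))
		perror("VHOST_SET_OWNER");

	close(fd);
	return 0;
}

Devices that never issue the ioctl keep the default set by the fork_from_owner_default module parameter added in the vhost.c hunk above.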

@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
if (IS_ERR(tsk)) {
kfree(vtsk);
return ERR_PTR(PTR_ERR(tsk));
return ERR_CAST(tsk);
}
vtsk->task = tsk;

@ -307,7 +307,7 @@ out_rcu:
static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
{
int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM;
int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
struct scatterlist pkt, *p;
struct virtqueue *vq;
struct sk_buff *skb;
@ -316,7 +316,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
vq = vsock->vqs[VSOCK_VQ_RX];
do {
skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL);
skb = virtio_vsock_alloc_linear_skb(total_len, GFP_KERNEL);
if (!skb)
break;
@ -624,8 +624,9 @@ static void virtio_transport_rx_work(struct work_struct *work)
do {
virtqueue_disable_cb(vq);
for (;;) {
unsigned int len, payload_len;
struct virtio_vsock_hdr *hdr;
struct sk_buff *skb;
unsigned int len;
if (!virtio_transport_more_replies(vsock)) {
/* Stop rx until the device processes already
@ -642,13 +643,22 @@ static void virtio_transport_rx_work(struct work_struct *work)
vsock->rx_buf_nr--;
/* Drop short/long packets */
if (unlikely(len < sizeof(struct virtio_vsock_hdr) ||
if (unlikely(len < sizeof(*hdr) ||
len > virtio_vsock_skb_len(skb))) {
kfree_skb(skb);
continue;
}
virtio_vsock_skb_rx_put(skb);
hdr = virtio_vsock_hdr(skb);
payload_len = le32_to_cpu(hdr->len);
if (unlikely(payload_len > len - sizeof(*hdr))) {
kfree_skb(skb);
continue;
}
if (payload_len)
virtio_vsock_skb_put(skb, payload_len);
virtio_transport_deliver_tap_pkt(skb);
virtio_transport_recv_pkt(&virtio_transport, skb);
}

@ -109,7 +109,8 @@ static int virtio_transport_fill_skb(struct sk_buff *skb,
return __zerocopy_sg_from_iter(info->msg, NULL, skb,
&info->msg->msg_iter, len, NULL);
return memcpy_from_msg(skb_put(skb, len), info->msg, len);
virtio_vsock_skb_put(skb, len);
return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len);
}
static void virtio_transport_init_hdr(struct sk_buff *skb,