// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
 * Copyright (C) 2006 Rusty Russell IBM Corporation
 *
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * Inspiration, some code, and most witty comments come from
 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
 *
 * Generic code for virtio server in host kernel.
 */
if (copy_to_user(argp, &s, sizeof(s))) return -EFAULT;
return 0;
}
staticvoid vhost_init_is_le(struct vhost_virtqueue *vq)
{ /* Note for legacy virtio: user_be is initialized at reset time * according to the host endianness. If userspace does not set an * explicit endianness, the default behavior is native endian, as * expected by legacy virtio.
*/
vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
} #else staticvoid vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}
/* Start polling a file. We add ourselves to file's wait queue. The caller must
 * keep a reference to a file until after vhost_poll_stop is called.
 */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
	__poll_t mask;

	/* Already registered on a wait queue; nothing to do. */
	if (poll->wqh)
		return 0;

	mask = vfs_poll(file, &poll->table);
	if (mask)
		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
	if (mask & EPOLLERR) {
		vhost_poll_stop(poll);
		return -EINVAL;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(vhost_poll_start);
/* Stop polling a file. After this function returns, it becomes safe to drop the
 * file reference. You must also flush afterwards.
 */
void vhost_poll_stop(struct vhost_poll *poll)
{
	if (poll->wqh) {
		remove_wait_queue(poll->wqh, &poll->wait);
		poll->wqh = NULL;
	}
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);
/* Queue work on a worker and wake it, unless the work is already queued. */
static void vhost_worker_queue(struct vhost_worker *worker,
			       struct vhost_work *work)
{
	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
		/* We can only add the work to the list after we're
		 * sure it was not in the list.
		 * test_and_set_bit() implies a memory barrier.
		 */
		llist_add(&work->node, &worker->work_list);
		worker->ops->wakeup(worker);
	}
}
vhost_worker_queue(worker, &flush.work); /* * Drop mutex in case our worker is killed and it needs to take the * mutex to force cleanup.
*/
mutex_unlock(&worker->mutex);
wait_for_completion(&flush.wait_event);
mutex_lock(&worker->mutex);
}
xa_for_each(&dev->worker_xa, i, worker)
vhost_worker_flush(worker);
}
EXPORT_SYMBOL_GPL(vhost_dev_flush);
/* A lockless hint for busy polling code to exit the loop */ bool vhost_vq_has_work(struct vhost_virtqueue *vq)
{ struct vhost_worker *worker; bool has_work = false;
worker->attachment_cnt -= attach_cnt; if (attach_cnt)
synchronize_rcu(); /* * Finish vhost_worker_flush calls and any other works that snuck in * before the synchronize_rcu.
*/
vhost_run_work_list(worker);
mutex_unlock(&worker->mutex);
}
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
vq->log = NULL;
vq->indirect = NULL;
vq->heads = NULL;
vq->nheads = NULL;
vq->dev = dev;
mutex_init(&vq->mutex);
vhost_vq_reset(dev, vq); if (vq->handle_kick)
vhost_poll_init(&vq->poll, vq->handle_kick,
EPOLLIN, dev, vq);
}
}
EXPORT_SYMBOL_GPL(vhost_dev_init);
/* Caller should have device mutex */
long vhost_dev_check_owner(struct vhost_dev *dev)
{
	/* Are you the owner? If not, I don't think you mean to do that */
	return dev->mm == current->mm ? 0 : -EPERM;
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
/* * Bypass attachment_cnt check in __vhost_worker_flush: * Temporarily change it to INT_MAX to bypass the check
*/
saved_cnt = worker->attachment_cnt;
worker->attachment_cnt = INT_MAX;
__vhost_worker_flush(worker);
worker->attachment_cnt = saved_cnt;
mutex_unlock(&worker->mutex);
return attach.ret;
}
/* Caller should have device mutex */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
	/* Non-NULL mm means an owner has been set. */
	return dev->mm;
}
EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
/* Take a reference on the current task's mm as the device owner. */
static void vhost_attach_mm(struct vhost_dev *dev)
{
	/* No owner, become one */
	if (dev->use_worker) {
		dev->mm = get_task_mm(current);
	} else {
		/* vDPA device does not use worker thread, so there's
		 * no need to hold the address space for mm. This helps
		 * to avoid deadlock in the case of mmap() which may
		 * hold the refcnt of the file and depends on release
		 * method to remove vma.
		 */
		dev->mm = current->mm;
		mmgrab(dev->mm);
	}
}
staticvoid vhost_detach_mm(struct vhost_dev *dev)
{ if (!dev->mm) return;
if (dev->use_worker)
mmput(dev->mm); else
mmdrop(dev->mm);
for (i = 0; i < dev->nvqs; i++)
rcu_assign_pointer(dev->vqs[i]->worker, NULL); /* * Free the default worker we created and cleanup workers userspace * created but couldn't clean up (it forgot or crashed).
*/
xa_for_each(&dev->worker_xa, i, worker)
vhost_worker_destroy(dev, worker);
xa_destroy(&dev->worker_xa);
}
if (!old_worker) {
mutex_unlock(&vq->mutex);
mutex_unlock(&worker->mutex); return;
}
mutex_unlock(&vq->mutex);
mutex_unlock(&worker->mutex);
/* * Take the worker mutex to make sure we see the work queued from * device wide flushes which doesn't use RCU for execution.
*/
mutex_lock(&old_worker->mutex); if (old_worker->killed) {
mutex_unlock(&old_worker->mutex); return;
}
/* * We don't want to call synchronize_rcu for every vq during setup * because it will slow down VM startup. If we haven't done * VHOST_SET_VRING_KICK and not done the driver specific * SET_ENDPOINT/RUNNING then we can skip the sync since there will * not be any works queued for scsi and net.
*/
mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq) && !vq->kick) {
mutex_unlock(&vq->mutex);
old_worker->attachment_cnt--;
mutex_unlock(&old_worker->mutex); /* * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID. * Warn if it adds support for multiple workers but forgets to * handle the early queueing case.
*/
WARN_ON(!old_worker->attachment_cnt &&
!llist_empty(&old_worker->work_list)); return;
}
mutex_unlock(&vq->mutex);
/* Make sure new vq queue/flush/poll calls see the new worker */
synchronize_rcu(); /* Make sure whatever was queued gets run */
__vhost_worker_flush(old_worker);
old_worker->attachment_cnt--;
mutex_unlock(&old_worker->mutex);
}
/* Caller must have device mutex */ staticint vhost_vq_attach_worker(struct vhost_virtqueue *vq, struct vhost_vring_worker *info)
{ unsignedlong index = info->worker_id; struct vhost_dev *dev = vq->dev; struct vhost_worker *worker;
mutex_lock(&worker->mutex); if (worker->attachment_cnt || worker->killed) {
mutex_unlock(&worker->mutex); return -EBUSY;
} /* * A flush might have raced and snuck in before attachment_cnt was set * to zero. Make sure flushes are flushed from the queue before * freeing.
*/
__vhost_worker_flush(worker);
mutex_unlock(&worker->mutex);
/* Caller must have device mutex */ long vhost_worker_ioctl(struct vhost_dev *dev, unsignedint ioctl, void __user *argp)
{ struct vhost_vring_worker ring_worker; struct vhost_worker_state state; struct vhost_worker *worker; struct vhost_virtqueue *vq; long ret;
u32 idx;
if (!dev->use_worker) return -EINVAL;
if (!vhost_dev_has_owner(dev)) return -EINVAL;
ret = vhost_dev_check_owner(dev); if (ret) return ret;
switch (ioctl) { /* dev worker ioctls */ case VHOST_NEW_WORKER: /* * vhost_tasks will account for worker threads under the parent's * NPROC value but kthreads do not. To avoid userspace overflowing * the system with worker threads fork_owner must be true.
*/ if (!dev->fork_owner) return -EFAULT;
ret = vhost_new_worker(dev, &state); if (!ret && copy_to_user(argp, &state, sizeof(state)))
ret = -EFAULT; return ret; case VHOST_FREE_WORKER: if (copy_from_user(&state, argp, sizeof(state))) return -EFAULT; return vhost_free_worker(dev, &state); /* vring worker ioctls */ case VHOST_ATTACH_VRING_WORKER: case VHOST_GET_VRING_WORKER: break; default: return -ENOIOCTLCMD;
}
ret = vhost_get_vq_from_user(dev, argp, &vq, &idx); if (ret) return ret;
switch (ioctl) { case VHOST_ATTACH_VRING_WORKER: if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
ret = -EFAULT; break;
}
ret = vhost_vq_attach_worker(vq, &ring_worker); break; case VHOST_GET_VRING_WORKER:
worker = rcu_dereference_check(vq->worker,
lockdep_is_held(&dev->mutex)); if (!worker) {
ret = -EINVAL; break;
}
/* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev)
{ struct vhost_worker *worker; int err, i;
/* Is there an owner already? */ if (vhost_dev_has_owner(dev)) {
err = -EBUSY; goto err_mm;
}
vhost_attach_mm(dev);
err = vhost_dev_alloc_iovecs(dev); if (err) goto err_iovecs;
if (dev->use_worker) { /* * This should be done last, because vsock can queue work * before VHOST_SET_OWNER so it simplifies the failure path * below since we don't have to worry about vsock queueing * while we free the worker.
*/
worker = vhost_worker_create(dev); if (!worker) {
err = -ENOMEM; goto err_worker;
}
for (i = 0; i < dev->nvqs; i++)
__vhost_vq_attach_worker(dev->vqs[i], worker);
}
/* Caller should have device mutex */
void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
{
	int i;

	vhost_dev_cleanup(dev);

	dev->fork_owner = fork_from_owner_default;
	dev->umem = umem;
	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
	 * VQs aren't running.
	 */
	for (i = 0; i < dev->nvqs; ++i)
		dev->vqs[i]->umem = umem;
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
/* Stop kick polling on every virtqueue of the device. */
void vhost_dev_stop(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
			vhost_poll_stop(&dev->vqs[i]->poll);
	}

	/* NOTE(review): the tail of this function appears truncated in this
	 * copy; the flush and export restored per the reference
	 * implementation — confirm against upstream vhost.c.
	 */
	vhost_dev_flush(dev);
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);
/* Release every per-vq and per-device resource: eventfd contexts, kick
 * files, iovecs, IOTLBs, pending messages, workers and the owner mm.
 */
void vhost_dev_cleanup(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i]->error_ctx)
			eventfd_ctx_put(dev->vqs[i]->error_ctx);
		if (dev->vqs[i]->kick)
			fput(dev->vqs[i]->kick);
		if (dev->vqs[i]->call_ctx.ctx)
			eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx);
		vhost_vq_reset(dev, dev->vqs[i]);
	}
	vhost_dev_free_iovecs(dev);
	if (dev->log_ctx)
		eventfd_ctx_put(dev->log_ctx);
	dev->log_ctx = NULL;
	/* No one will access memory at this point */
	vhost_iotlb_free(dev->umem);
	dev->umem = NULL;
	vhost_iotlb_free(dev->iotlb);
	dev->iotlb = NULL;
	vhost_clear_msg(dev);
	/* Wake readers blocked in vhost_chr_read_iter so they can bail out. */
	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
	vhost_workers_free(dev);
	vhost_detach_mm(dev);
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
/* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ staticbool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem, int log_all)
{ int i;
for (i = 0; i < d->nvqs; ++i) { bool ok; bool log;
mutex_lock(&d->vqs[i]->mutex);
log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL); /* If ring is inactive, will check when it's enabled. */ if (d->vqs[i]->private_data)
ok = vq_memory_access_ok(d->vqs[i]->log_base,
umem, log); else
ok = true;
mutex_unlock(&d->vqs[i]->mutex); if (!ok) returnfalse;
} returntrue;
}
staticint translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, struct iovec iov[], int iov_size, int access);
if (!vq->iotlb) return __copy_to_user(to, from, size); else { /* This function should be called after iotlb * prefetch, which means we're sure that all vq * could be access through iotlb. So -EAGAIN should * not happen in this case.
*/ struct iov_iter t; void __user *uaddr = vhost_vq_meta_fetch(vq,
(u64)(uintptr_t)to, size,
VHOST_ADDR_USED);
if (uaddr) return __copy_to_user(uaddr, from, size);
ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
ARRAY_SIZE(vq->iotlb_iov),
VHOST_ACCESS_WO); if (ret < 0) goto out;
iov_iter_init(&t, ITER_DEST, vq->iotlb_iov, ret, size);
ret = copy_to_iter(from, size, &t); if (ret == size)
ret = 0;
}
out: return ret;
}
if (!vq->iotlb) return __copy_from_user(to, from, size); else { /* This function should be called after iotlb * prefetch, which means we're sure that vq * could be access through iotlb. So -EAGAIN should * not happen in this case.
*/ void __user *uaddr = vhost_vq_meta_fetch(vq,
(u64)(uintptr_t)from, size,
VHOST_ADDR_DESC); struct iov_iter f;
if (uaddr) return __copy_from_user(to, uaddr, size);
ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
ARRAY_SIZE(vq->iotlb_iov),
VHOST_ACCESS_RO); if (ret < 0) {
vq_err(vq, "IOTLB translation failure: uaddr " "%p size 0x%llx\n", from,
(unsignedlonglong) size); goto out;
}
iov_iter_init(&f, ITER_SOURCE, vq->iotlb_iov, ret, size);
ret = copy_from_iter(to, size, &f); if (ret == size)
ret = 0;
}
out: return ret;
}
staticvoid __user *__vhost_get_user_slow(struct vhost_virtqueue *vq, void __user *addr, unsignedint size, int type)
{ int ret;
/* This function should be called after iotlb * prefetch, which means we're sure that vq * could be access through iotlb. So -EAGAIN should * not happen in this case.
*/ staticinlinevoid __user *__vhost_get_user(struct vhost_virtqueue *vq, void __user *addr, unsignedint size, int type)
{ void __user *uaddr = vhost_vq_meta_fetch(vq,
(u64)(uintptr_t)addr, size, type); if (uaddr) return uaddr;
/* Lock all vq mutexes of the device, in index order. */
static void vhost_dev_lock_vqs(struct vhost_dev *d)
{
	int i = 0;

	/* Distinct lockdep subclass per vq avoids false nesting reports. */
	for (i = 0; i < d->nvqs; ++i)
		mutex_lock_nested(&d->vqs[i]->mutex, i);
}
/* Unlock all vq mutexes taken by vhost_dev_lock_vqs(). */
static void vhost_dev_unlock_vqs(struct vhost_dev *d)
{
	int i = 0;

	for (i = 0; i < d->nvqs; ++i)
		mutex_unlock(&d->vqs[i]->mutex);
}
/* Refresh vq->avail_idx from the guest-visible ring.
 * Returns 0 if nothing new, 1 if new entries are available, negative errno
 * on access failure or a bogus index from the guest.
 */
static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
{
	__virtio16 idx;
	int r;

	r = vhost_get_avail(vq, idx, &vq->avail->idx);
	if (unlikely(r < 0)) {
		vq_err(vq, "Failed to access available index at %p (%d)\n",
		       &vq->avail->idx, r);
		return r;
	}

	/* Check it isn't doing very strange thing with available indexes */
	vq->avail_idx = vhost16_to_cpu(vq, idx);
	if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
		vq_err(vq, "Invalid available index change from %u to %u",
		       vq->last_avail_idx, vq->avail_idx);
		return -EINVAL;
	}

	/* We're done if there is nothing new */
	if (vq->avail_idx == vq->last_avail_idx)
		return 0;

	/*
	 * We updated vq->avail_idx so we need a memory barrier between
	 * the index read above and the caller reading avail ring entries.
	 */
	smp_rmb();
	return 1;
}
ret = copy_from_iter(&type, sizeof(type), from); if (ret != sizeof(type)) {
ret = -EINVAL; goto done;
}
switch (type) { case VHOST_IOTLB_MSG: /* There maybe a hole after type for V1 message type, * so skip it here.
*/
offset = offsetof(struct vhost_msg, iotlb) - sizeof(int); break; case VHOST_IOTLB_MSG_V2: if (vhost_backend_has_feature(dev->vqs[0],
VHOST_BACKEND_F_IOTLB_ASID)) {
ret = copy_from_iter(&asid, sizeof(asid), from); if (ret != sizeof(asid)) {
ret = -EINVAL; goto done;
}
offset = 0;
} else
offset = sizeof(__u32); break; default:
ret = -EINVAL; goto done;
}
iov_iter_advance(from, offset);
ret = copy_from_iter(&msg, sizeof(msg), from); if (ret != sizeof(msg)) {
ret = -EINVAL; goto done;
}
if (msg.type == VHOST_IOTLB_UPDATE && msg.size == 0) {
ret = -EINVAL; goto done;
}
if (dev->msg_handler)
ret = dev->msg_handler(dev, asid, &msg); else
ret = vhost_process_iotlb_msg(dev, asid, &msg); if (ret) {
ret = -EFAULT; goto done;
}
while (1) { if (!noblock)
prepare_to_wait(&dev->wait, &wait,
TASK_INTERRUPTIBLE);
node = vhost_dequeue_msg(dev, &dev->read_list); if (node) break; if (noblock) {
ret = -EAGAIN; break;
} if (signal_pending(current)) {
ret = -ERESTARTSYS; break;
} if (!dev->iotlb) {
ret = -EBADFD; break;
}
schedule();
}
if (!noblock)
finish_wait(&dev->wait, &wait);
if (node) { struct vhost_iotlb_msg *msg; void *start = &node->msg;
/* Can we log writes? */
/* Caller should have device mutex but not vq mutex */
bool vhost_log_access_ok(struct vhost_dev *dev)
{
	return memory_access_ok(dev, dev->umem, 1);
}
EXPORT_SYMBOL_GPL(vhost_log_access_ok);
staticbool vq_log_used_access_ok(struct vhost_virtqueue *vq, void __user *log_base, bool log_used,
u64 log_addr)
{ /* If an IOTLB device is present, log_addr is a GIOVA that
* will never be logged by log_used(). */ if (vq->iotlb) returntrue;
/* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ staticbool vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
{ return vq_memory_access_ok(log_base, vq->umem,
vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
}
/* Can we start vq? */ /* Caller should have vq mutex and device mutex */ bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
{ if (!vq_log_access_ok(vq, vq->log_base)) returnfalse;
newumem = iotlb_alloc(); if (!newumem) {
kvfree(newmem); return -ENOMEM;
}
for (region = newmem->regions;
region < newmem->regions + mem.nregions;
region++) { if (vhost_iotlb_add_range(newumem,
region->guest_phys_addr,
region->guest_phys_addr +
region->memory_size - 1,
region->userspace_addr,
VHOST_MAP_RW)) goto err;
}
if (!memory_access_ok(d, newumem, 0)) goto err;
oldumem = d->umem;
d->umem = newumem;
/* All memory accesses are done under some VQ mutex. */ for (i = 0; i < d->nvqs; ++i) {
mutex_lock(&d->vqs[i]->mutex);
d->vqs[i]->umem = newumem;
mutex_unlock(&d->vqs[i]->mutex);
}
if (copy_from_user(&a, argp, sizeof a)) return -EFAULT; if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) return -EOPNOTSUPP;
/* For 32bit, verify that the top 32bits of the user
data are set to zero. */ if ((u64)(unsignedlong)a.desc_user_addr != a.desc_user_addr ||
(u64)(unsignedlong)a.used_user_addr != a.used_user_addr ||
(u64)(unsignedlong)a.avail_user_addr != a.avail_user_addr) return -EFAULT;
/* Make sure it's safe to cast pointers to vring types. */
BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE); if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
(a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
(a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) return -EINVAL;
/* We only verify access here if backend is configured. * If it is not, we don't as size might not have been setup.
* We will verify when backend is configured. */ if (vq->private_data) { if (!vq_access_ok(vq, vq->num,
(void __user *)(unsignedlong)a.desc_user_addr,
(void __user *)(unsignedlong)a.avail_user_addr,
(void __user *)(unsignedlong)a.used_user_addr)) return -EINVAL;
/* Also validate log access for used ring if enabled. */ if (!vq_log_used_access_ok(vq, vq->log_base,
a.flags & (0x1 << VHOST_VRING_F_LOG),
a.log_guest_addr)) return -EINVAL;
}
/* Device-wide ioctl handler: owner setup, fork-owner control, memory table,
 * dirty-log base and log eventfd.
 */
/* Caller must have device mutex */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
	struct eventfd_ctx *ctx;
	u64 p;
	long r;
	int i, fd;

	/* If you are not the owner, you can become one */
	if (ioctl == VHOST_SET_OWNER) {
		r = vhost_dev_set_owner(d);
		goto done;
	}

#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
	if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
		u8 fork_owner_val;

		/* Only allow modification before owner is set */
		if (vhost_dev_has_owner(d)) {
			r = -EBUSY;
			goto done;
		}
		if (get_user(fork_owner_val, (u8 __user *)argp)) {
			r = -EFAULT;
			goto done;
		}
		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
			r = -EINVAL;
			goto done;
		}
		d->fork_owner = !!fork_owner_val;
		r = 0;
		goto done;
	}
	if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
		u8 fork_owner_val = d->fork_owner;

		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
			r = -EINVAL;
			goto done;
		}
		if (put_user(fork_owner_val, (u8 __user *)argp)) {
			r = -EFAULT;
			goto done;
		}
		r = 0;
		goto done;
	}
#endif

	/* You must be the owner to do anything else */
	r = vhost_dev_check_owner(d);
	if (r)
		goto done;

	switch (ioctl) {
	case VHOST_SET_MEM_TABLE:
		r = vhost_set_memory(d, argp);
		break;
	case VHOST_SET_LOG_BASE:
		if (copy_from_user(&p, argp, sizeof p)) {
			r = -EFAULT;
			break;
		}
		/* For 32bit, reject addresses with the top 32 bits set. */
		if ((u64)(unsigned long)p != p) {
			r = -EFAULT;
			break;
		}
		for (i = 0; i < d->nvqs; ++i) {
			struct vhost_virtqueue *vq;
			void __user *base = (void __user *)(unsigned long)p;

			vq = d->vqs[i];
			mutex_lock(&vq->mutex);
			/* If ring is inactive, will check when it's enabled. */
			if (vq->private_data && !vq_log_access_ok(vq, base))
				r = -EFAULT;
			else
				vq->log_base = base;
			mutex_unlock(&vq->mutex);
		}
		break;
	case VHOST_SET_LOG_FD:
		r = get_user(fd, (int __user *)argp);
		if (r < 0)
			break;
		ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
		if (IS_ERR(ctx)) {
			r = PTR_ERR(ctx);
			break;
		}
		/* Swap in the new ctx; the old one is released below. */
		swap(ctx, d->log_ctx);
		for (i = 0; i < d->nvqs; ++i) {
			mutex_lock(&d->vqs[i]->mutex);
			d->vqs[i]->log_ctx = d->log_ctx;
			mutex_unlock(&d->vqs[i]->mutex);
		}
		if (ctx)
			eventfd_ctx_put(ctx);
		break;
	default:
		r = -ENOIOCTLCMD;
		break;
	}
done:
	return r;
}
EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
/* TODO: This is really inefficient.  We need something like get_user()
 * (instruction directly accesses the data, with an exception table entry
 * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst.
 */
static int set_bit_to_user(int nr, void __user *addr)
{
	unsigned long log = (unsigned long)addr;
	struct page *page;
	void *base;
	/* Bit offset within the pinned page, from the sub-page byte offset. */
	int bit = nr + (log % PAGE_SIZE) * 8;
	int r;

	r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page);
	if (r < 0)
		return r;
	BUG_ON(r != 1);
	base = kmap_atomic(page);
	set_bit(bit, base);
	kunmap_atomic(base);
	unpin_user_pages_dirty_lock(&page, 1, true);
	return 0;
}
while (len) {
min = len; /* More than one GPAs can be mapped into a single HVA. So * iterate all possible umems here to be safe.
*/
list_for_each_entry(u, &umem->list, link) { if (u->addr > hva - 1 + len ||
u->addr - 1 + u->size < hva) continue;
start = max(u->addr, hva);
end = min(u->addr - 1 + u->size, hva - 1 + len);
l = end - start + 1;
r = log_write(vq->log_base,
u->start + start - u->addr,
l); if (r < 0) return r;
hit = true;
min = min(l, min);
}
if (!hit) return -EFAULT;
len -= min;
hva += min;
}
return 0;
}
/* Log a write to the used ring region, translating through the IOTLB when
 * one is present.
 */
static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
{
	struct iovec *iov = vq->log_iov;
	int i, ret;

	if (!vq->iotlb)
		return log_write(vq->log_base, vq->log_addr + used_offset, len);

	ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
			     len, iov, 64, VHOST_ACCESS_WO);
	if (ret < 0)
		return ret;

	for (i = 0; i < ret; i++) {
		ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
				    iov[i].iov_len);
		if (ret)
			return ret;
	}

	return 0;
}
/*
 * vhost_log_write() - Log in dirty page bitmap
 * @vq:      vhost virtqueue.
 * @log:     Array of dirty memory in GPA.
 * @log_num: Size of vhost_log array.
 * @len:     The total length of memory buffer to log in the dirty bitmap.
 *	     Some drivers may only partially use pages shared via the last
 *	     vring descriptor (i.e. vhost-net RX buffer).
 *	     Use (len == U64_MAX) to indicate the driver would log all
 *	     pages of vring descriptors.
 * @iov:     Array of dirty memory in HVA.
 * @count:   Size of iovec array.
 */
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
		    unsigned int log_num, u64 len, struct iovec *iov, int count)
{
	int i, r;

	/* Make sure data written is seen before log. */
	smp_wmb();

	if (vq->iotlb) {
		for (i = 0; i < count; i++) {
			r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
					  iov[i].iov_len);
			if (r < 0)
				return r;
		}
		return 0;
	}

	for (i = 0; i < log_num; ++i) {
		u64 l = min(log[i].len, len);

		r = log_write(vq->log_base, log[i].addr, l);
		if (r < 0)
			return r;
		if (len != U64_MAX)
			len -= l;
	}

	if (vq->log_ctx)
		eventfd_signal(vq->log_ctx);

	return 0;
}
EXPORT_SYMBOL_GPL(vhost_log_write);
/* Write vq->used_flags to the guest-visible used ring and log the write. */
static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
	void __user *used;

	if (vhost_put_used_flags(vq))
		return -EFAULT;
	if (unlikely(vq->log_used)) {
		/* Make sure the flag is seen before log. */
		smp_wmb();
		/* Log used flag write. */
		used = &vq->used->flags;
		log_used(vq, (used - (void __user *)vq->used),
			 sizeof vq->used->flags);
		if (vq->log_ctx)
			eventfd_signal(vq->log_ctx);
	}
	return 0;
}
/* Publish the avail event index to the guest and log the write. */
static int vhost_update_avail_event(struct vhost_virtqueue *vq)
{
	if (vhost_put_avail_event(vq))
		return -EFAULT;
	if (unlikely(vq->log_used)) {
		void __user *used;

		/* Make sure the event is seen before log. */
		smp_wmb();
		/* Log avail event write */
		used = vhost_avail_event(vq);
		log_used(vq, (used - (void __user *)vq->used),
			 sizeof *vhost_avail_event(vq));
		if (vq->log_ctx)
			eventfd_signal(vq->log_ctx);
	}
	return 0;
}
/* Initialize ring access state for a vq that just got a backend:
 * pick endianness, publish used flags, and read the initial used index.
 */
int vhost_vq_init_access(struct vhost_virtqueue *vq)
{
	__virtio16 last_used_idx;
	int r;
	bool is_le = vq->is_le;

	if (!vq->private_data)
		return 0;

	vhost_init_is_le(vq);

	r = vhost_update_used_flags(vq);
	if (r)
		goto err;
	vq->signalled_used_valid = false;
	if (!vq->iotlb &&
	    !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
		r = -EFAULT;
		goto err;
	}
	r = vhost_get_used_idx(vq, &last_used_idx);
	if (r) {
		vq_err(vq, "Can't access used idx at %p\n",
		       &vq->used->idx);
		goto err;
	}
	vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
	return 0;

	/* NOTE(review): the error path below was truncated in this copy;
	 * restored per the reference implementation, which rolls back the
	 * endianness saved in is_le — confirm against upstream vhost.c.
	 */
err:
	vq->is_le = is_le;
	return r;
}
EXPORT_SYMBOL_GPL(vhost_vq_init_access);
if (ret == -EAGAIN)
vhost_iotlb_miss(vq, addr, access); return ret;
}
/* Each buffer in the virtqueues is actually a chain of descriptors.  This
 * function returns the next descriptor in the chain,
 * or -1U if we're at the end.
 */
static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
{
	unsigned int next;

	/* If this descriptor says it doesn't chain, we're done. */
	if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
		return -1U;

	/* Check they're not leading us off end of descriptors. */
	next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
	return next;
}
/* Sanity check */ if (unlikely(len % sizeof desc)) {
vq_err(vq, "Invalid length in indirect descriptor: " "len 0x%llx not multiple of 0x%zx\n",
(unsignedlonglong)len, sizeof desc); return -EINVAL;
}
ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
UIO_MAXIOV, VHOST_ACCESS_RO); if (unlikely(ret < 0)) { if (ret != -EAGAIN)
vq_err(vq, "Translation failure %d in indirect.\n", ret); return ret;
}
iov_iter_init(&from, ITER_SOURCE, vq->indirect, ret, len);
count = len / sizeof desc; /* Buffers are chained via a 16 bit next field, so
* we can have at most 2^16 of these. */ if (unlikely(count > USHRT_MAX + 1)) {
vq_err(vq, "Indirect buffer length too big: %d\n",
indirect->len); return -E2BIG;
}
do { unsigned iov_count = *in_num + *out_num; if (unlikely(++found > count)) {
vq_err(vq, "Loop detected: last one at %u " "indirect size %u\n",
i, count); return -EINVAL;
} if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL;
} if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL;
}
ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
vhost32_to_cpu(vq, desc.len), iov + iov_count,
iov_size - iov_count, access); if (unlikely(ret < 0)) { if (ret != -EAGAIN)
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.43Bemerkung:
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.