// SPDX-License-Identifier: GPL-2.0-only /* * VFIO core * * Copyright (C) 2012 Red Hat, Inc. All rights reserved. * Author: Alex Williamson <alex.williamson@redhat.com> * * Derived from original vfio: * Copyright 2010 Cisco Systems, Inc. All rights reserved. * Author: Tom Lyon, pugs@cisco.com
*/
staticstruct vfio { structclass *device_class; struct ida device_ida; struct vfsmount *vfs_mount; int fs_count;
} vfio;
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Backing variable for the "enable_unsafe_noiommu_mode" module parameter.
 * The parameter is world-readable and owner-writable (0644).
 */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode, vfio_noiommu, bool, 0644);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode,
		 "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
/* * Atomically acquire a singleton object in the xarray for this set_id
*/
xa_lock(&vfio_device_set_xa);
dev_set = xa_load(&vfio_device_set_xa, idx); if (dev_set) goto found_get_ref;
xa_unlock(&vfio_device_set_xa);
/* * Allocate and initialize vfio_device so it can be registered to vfio * core. * * Drivers should use the wrapper vfio_alloc_device() for allocation. * @size is the size of the structure to be allocated, including any * private data used by the driver. * * Driver may provide an @init callback to cover device private data. * * Use vfio_put_device() to release the structure after success return.
*/ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, conststruct vfio_device_ops *ops)
{ struct vfio_device *device; int ret;
if (WARN_ON(size < sizeof(struct vfio_device))) return ERR_PTR(-EINVAL);
device = kvzalloc(size, GFP_KERNEL); if (!device) return ERR_PTR(-ENOMEM);
ret = vfio_init_device(device, dev, ops); if (ret) goto out_free; return device;
staticstruct inode *vfio_fs_inode_new(void)
{ struct inode *inode; int ret;
ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count); if (ret) return ERR_PTR(ret);
inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb); if (IS_ERR(inode))
simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
return inode;
}
/* * Initialize a vfio_device so it can be registered to vfio core.
*/ staticint vfio_init_device(struct vfio_device *device, struct device *dev, conststruct vfio_device_ops *ops)
{ int ret;
ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL); if (ret < 0) {
dev_dbg(dev, "Error to alloc index\n"); return ret;
}
/* * If the driver doesn't specify a set then the device is added to a * singleton set just for itself.
*/ if (!device->dev_set)
vfio_assign_device_set(device, device);
ret = dev_set_name(&device->device, "vfio%d", device->index); if (ret) return ret;
ret = vfio_device_set_group(device, type); if (ret) return ret;
/* * VFIO always sets IOMMU_CACHE because we offer no way for userspace to * restore cache coherency. It has to be checked here because it is only * valid for cases where we are using iommu groups.
*/ if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
!device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
ret = -EINVAL; goto err_out;
}
ret = vfio_device_add(device); if (ret) goto err_out;
/* Refcounting can't start until the driver calls register */
refcount_set(&device->refcount, 1);
/*
 * Register @device with the vfio core using the VFIO_IOMMU group type.
 * The result of __vfio_register_dev() is returned unchanged.
 */
int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
/*
 * Register a virtual device that has no IOMMU backing, using the
 * VFIO_EMULATED_IOMMU group type.
 *
 * Users of such a device must not be able to directly trigger unmediated
 * DMA. The result of __vfio_register_dev() is returned unchanged.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
/* * Decrement the device reference count and wait for the device to be
* removed. Open file descriptors for the device... */ void vfio_unregister_group_dev(struct vfio_device *device)
{ unsignedint i = 0; bool interrupted = false; long rc;
/* * Prevent new device opened by userspace via the * VFIO_GROUP_GET_DEVICE_FD in the group path.
*/
vfio_device_group_unregister(device);
/* * Balances vfio_device_add() in register path, also prevents * new device opened by userspace in the cdev path.
*/
vfio_device_del(device);
vfio_device_put_registration(device);
rc = try_wait_for_completion(&device->comp); while (rc <= 0) { if (device->ops->request)
device->ops->request(device, i++);
if (interrupted) {
rc = wait_for_completion_timeout(&device->comp,
HZ * 10);
} else {
rc = wait_for_completion_interruptible_timeout(
&device->comp, HZ * 10); if (rc < 0) {
interrupted = true;
dev_warn(device->dev, "Device is currently in use, task" " \"%s\" (%d) " "blocked until device is released",
current->comm, task_pid_nr(current));
}
}
}
/* true if the vfio_device has open_device() called but not close_device() */ staticbool vfio_assert_device_open(struct vfio_device *device)
{ return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}
if (device->ops->close_device)
device->ops->close_device(device); if (iommufd)
vfio_df_iommufd_unbind(df); else
vfio_device_group_unuse_iommu(device);
module_put(device->dev->driver->owner);
}
int vfio_df_open(struct vfio_device_file *df)
{ struct vfio_device *device = df->device; int ret = 0;
lockdep_assert_held(&device->dev_set->lock);
/* * Only the group path allows the device to be opened multiple * times. The device cdev path doesn't have a secure way for it.
*/ if (device->open_count != 0 && !df->group) return -EINVAL;
device->open_count++; if (device->open_count == 1) {
ret = vfio_df_device_first_open(df); if (ret)
device->open_count--;
}
/* * Arcs touching optional and unsupported states are skipped over. The * driver will instead see an arc from the original state to the next * logical state, as per the above comment.
*/
*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm]; while ((state_flags_table[*next_fsm] & device->migration_flags) !=
state_flags_table[*next_fsm])
*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
/* * Convert the driver's struct file into an FD number and return it to userspace
*/ staticint vfio_ioct_mig_return_fd(struct file *filp, void __user *arg, struct vfio_device_feature_mig_state *mig)
{ int ret; int fd;
fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) {
ret = fd; goto out_fput;
}
mig->data_fd = fd; if (copy_to_user(arg, mig, sizeof(*mig))) {
ret = -EFAULT; goto out_put_unused;
}
fd_install(fd, filp); return 0;
for (i = 0; i < nnodes; i++) { if (copy_from_user(&range, &ranges[i], sizeof(range))) {
ret = -EFAULT; goto end;
} if (!IS_ALIGNED(range.iova, control.page_size) ||
!IS_ALIGNED(range.length, control.page_size)) {
ret = -EINVAL; goto end;
}
if (check_add_overflow(range.iova, range.length, &iova_end) ||
iova_end > ULONG_MAX) {
ret = -EOVERFLOW; goto end;
}
nodes[i].start = range.iova;
nodes[i].last = range.iova + range.length - 1; if (interval_tree_iter_first(&root, nodes[i].start,
nodes[i].last)) { /* Range overlapping */
ret = -EINVAL; goto end;
}
interval_tree_insert(nodes + i, &root);
}
ret = device->log_ops->log_start(device, &root, nnodes,
&control.page_size); if (ret) goto end;
if (copy_to_user(arg, &control, sizeof(control))) {
ret = -EFAULT;
device->log_ops->log_stop(device);
}
/* GET & SET are mutually exclusive except with PROBE */ if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
(feature.flags & VFIO_DEVICE_FEATURE_SET) &&
(feature.flags & VFIO_DEVICE_FEATURE_GET)) return -EINVAL;
if (file->f_op != &vfio_device_fops) return NULL; return df->device;
}
/** * vfio_file_is_valid - True if the file is valid vfio file * @file: VFIO group file or VFIO device file
*/ bool vfio_file_is_valid(struct file *file)
{ return vfio_group_from_file(file) ||
vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);
/** * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file * is always CPU cache coherent * @file: VFIO group file or VFIO device file * * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop * bit in DMA transactions. A return of false indicates that the user has * rights to access additional instructions such as wbinvd on x86.
*/ bool vfio_file_enforced_coherent(struct file *file)
{ struct vfio_device *device; struct vfio_group *group;
group = vfio_group_from_file(file); if (group) return vfio_group_enforced_coherent(group);
device = vfio_device_from_file(file); if (device) return device_iommu_capable(device->dev,
IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
/* * The kvm is first recorded in the vfio_device_file, and will * be propagated to vfio_device::kvm when the file is bound to * iommufd successfully in the vfio device cdev path.
*/
spin_lock(&df->kvm_ref_lock);
df->kvm = kvm;
spin_unlock(&df->kvm_ref_lock);
}
/** * vfio_file_set_kvm - Link a kvm with VFIO drivers * @file: VFIO group file or VFIO device file * @kvm: KVM to link * * When a VFIO device is first opened the KVM will be available in * device->kvm if one was associated with the file.
*/ void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{ struct vfio_group *group;
group = vfio_group_from_file(file); if (group)
vfio_group_set_kvm(group, kvm);
if (vfio_device_from_file(file))
vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
/* * Sub-module support
*/ /* * Helper for managing a buffer of info chain capabilities, allocate or * reallocate a buffer with additional @size, filling in @id and @version * of the capability. A pointer to the new capability is returned. * * NB. The chain is based at the head of the buffer, so new entries are * added to the tail, vfio_info_cap_shift() should be called to fixup the * next offsets prior to copying to the user buffer.
*/ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
size_t size, u16 id, u16 version)
{ void *buf; struct vfio_info_cap_header *header, *tmp;
/* Ensure that the next capability struct will be aligned */
size = ALIGN(size, sizeof(u64));
/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned, must be positive.  This count
 *                 should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * -EINVAL is returned when @pages is NULL, @npage is not positive, the device
 * is not open, the driver has no dma_unmap op, the device has neither a
 * container nor an iommufd access, or (iommufd path) @iova exceeds ULONG_MAX.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/*
	 * @npage is a signed int: a negative count would otherwise be turned
	 * into a huge unsigned byte length by "npage * PAGE_SIZE" below, so
	 * reject it together with zero.
	 */
	/* group->container cannot change while a vfio device is open */
	if (!pages || npage <= 0 || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		/* iommufd reports 0 on success; callers expect the page count. */
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 *
 * Warns and does nothing if the device is not open or the driver has no
 * dma_unmap op.  Nothing is done either when the device has neither a
 * container nor an iommufd access.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)) ||
	    WARN_ON(!device->ops->dma_unmap))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
	} else if (device->iommufd_access) {
		/* Same page-aligned range computation as the pin side. */
		if (!WARN_ON(iova > ULONG_MAX))
			iommufd_access_unpin_pages(device->iommufd_access,
						   ALIGN_DOWN(iova, PAGE_SIZE),
						   npage * PAGE_SIZE);
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
/*
 * Let the CPUs perform a kind of virtual DMA on behalf of the device.
 *
 * The CPUs copy between a range of IOVAs that point at user space memory and
 * a kernel buffer.  Because the copy is done by the CPUs rather than by real
 * device DMA, the user space memory does not have to be pinned.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	unsigned int flags;

	if (!data || !len || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	/* Without a container only the iommufd access path remains. */
	if (!device->iommufd_access)
		return -EINVAL;
	if (iova > ULONG_MAX)
		return -EINVAL;

	flags = write ? IOMMUFD_ACCESS_RW_WRITE : 0;
	/* VFIO historically tries to auto-detect a kthread */
	if (!current->mm)
		flags |= IOMMUFD_ACCESS_RW_KTHREAD;

	return iommufd_access_rw(device->iommufd_access, iova, data,
				 len, flags);
}
EXPORT_SYMBOL(vfio_dma_rw);
/* * Module/class support
*/ staticint __init vfio_init(void)
{ int ret;
ida_init(&vfio.device_ida);
ret = vfio_group_init(); if (ret) return ret;
ret = vfio_virqfd_init(); if (ret) goto err_virqfd;
/* /sys/class/vfio-dev/vfioX */
vfio.device_class = class_create("vfio-dev"); if (IS_ERR(vfio.device_class)) {
ret = PTR_ERR(vfio.device_class); goto err_dev_class;
}
ret = vfio_cdev_init(vfio.device_class); if (ret) goto err_alloc_dev_chrdev;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.