// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 * Copyright Gavin Shan, IBM Corporation 2014.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 */
/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU.
 */
/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;	/* link in tce_container::prereg_list */
	/* preregistered memory region this container holds a reference on */
	struct mm_iommu_table_group_mem_t *mem;
};
/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;		/* protects all mutable state below */
	bool enabled;			/* container enabled via VFIO_IOMMU_ENABLE */
	bool v2;			/* true for VFIO_SPAPR_TCE_v2_IOMMU */
	bool def_window_pending;	/* default DMA window not yet created */
	/* fixed: "unsignedlong" fused keyword would not compile */
	unsigned long locked_pages;	/* pages accounted against RLIMIT_MEMLOCK */
	struct mm_struct *mm;		/* owning mm, pinned for accounting */
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;	/* attached tce_iommu_group entries */
	struct list_head prereg_list;	/* tce_iommu_prereg references (v2) */
};
page = pfn_to_page(hpa >> PAGE_SHIFT); /* * Check that the TCE table granularity is not bigger than the size of * a page we just found. Otherwise the hardware can get access to * a bigger memory chunk that it should.
*/ return page_shift(compound_head(page)) >= it_page_shift;
}
/*
 * Find the first unused slot in the container's table array.
 *
 * Returns the free slot index on success or -ENOSPC when all
 * IOMMU_TABLE_GROUP_MAX_TABLES slots are occupied.
 *
 * Fixed: "staticint" fused keyword would not compile.
 */
static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}
staticint tce_iommu_enable(struct tce_container *container)
{ int ret = 0; unsignedlong locked; struct iommu_table_group *table_group; struct tce_iommu_group *tcegrp;
if (container->enabled) return -EBUSY;
/* * When userspace pages are mapped into the IOMMU, they are effectively * locked memory, so, theoretically, we need to update the accounting * of locked pages on each map and unmap. For powerpc, the map unmap * paths can be very hot, though, and the accounting would kill * performance, especially since it would be difficult to impossible * to handle the accounting in real mode only. * * To address that, rather than precisely accounting every page, we * instead account for a worst case on locked memory when the iommu is * enabled and disabled. The worst case upper bound on locked memory * is the size of the whole iommu window, which is usually relatively * small (compared to total memory sizes) on POWER hardware. * * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. * * Unfortunately at the moment it counts whole tables, no matter how * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups * each with 2GB DMA window, 8GB will be counted here. The reason for * this is that we cannot tell here the amount of RAM used by the guest * as this information is only available from KVM and VFIO is * KVM agnostic. * * So we do not allow enabling a container without a group attached * as there is no way to know how much we should increment * the locked_vm counter.
*/ if (!tce_groups_attached(container)) return -ENODEV;
/* * If VFIO created a table, it was not disposed * by tce_iommu_detach_group() so do it now.
*/ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { struct iommu_table *tbl = container->tables[i];
ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
tbl->it_page_shift, &hpa, &mem); if (ret)
pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
__func__, be64_to_cpu(*pua), entry, ret); if (mem)
mm_iommu_mapped_dec(mem);
for ( ; entry < lastentry; ++entry) { if (tbl->it_indirect_levels && tbl->it_userspace) { /* * For multilevel tables, we can take a shortcut here * and skip some TCEs as we know that the userspace * addresses cache is a mirror of the real TCE table * and if it is missing some indirect levels, then * the hardware table does not have them allocated * either and therefore does not require updating.
*/
__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
entry); if (!pua) { /* align to level_size which is power of two */
entry |= tbl->it_level_size - 1; continue;
}
}
cond_resched();
direction = DMA_NONE;
oldhpa = 0;
ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
&direction); if (ret) continue;
if (direction == DMA_NONE) continue;
if (container->v2) {
tce_iommu_unuse_page_v2(container, tbl, entry); continue;
}
num = tce_iommu_find_free_table(container); if (num < 0) return num;
/* Get the first group for ops::create_table */
tcegrp = list_first_entry(&container->group_list, struct tce_iommu_group, next);
table_group = iommu_group_get_iommudata(tcegrp->grp); if (!table_group) return -EFAULT;
if (!(table_group->pgsizes & (1ULL << page_shift))) return -EINVAL;
if (!table_group->ops->set_window || !table_group->ops->unset_window ||
!table_group->ops->get_table_size ||
!table_group->ops->create_table) return -EPERM;
/* Create TCE table */
ret = tce_iommu_create_table(container, table_group, num,
page_shift, window_size, levels, &tbl); if (ret) return ret;
BUG_ON(!tbl->it_ops->free);
/* * Program the table to every group. * Groups have been tested for compatibility at the attach time.
*/
list_for_each_entry(tcegrp, &container->group_list, next) {
table_group = iommu_group_get_iommudata(tcegrp->grp);
ret = table_group->ops->set_window(table_group, num, tbl); if (ret) goto unset_exit;
}
container->tables[num] = tbl;
/* Return start address assigned by platform in create_table() */
*start_addr = tbl->it_offset << tbl->it_page_shift;
/* Detach groups from IOMMUs */
list_for_each_entry(tcegrp, &container->group_list, next) {
table_group = iommu_group_get_iommudata(tcegrp->grp);
/* * SPAPR TCE IOMMU exposes the default DMA window to * the guest via dma32_window_start/size of * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow * the userspace to remove this window, some do not so * here we check for the platform capability.
*/ if (!table_group->ops || !table_group->ops->unset_window) return -EPERM;
switch (cmd) { case VFIO_CHECK_EXTENSION: switch (arg) { case VFIO_SPAPR_TCE_IOMMU: case VFIO_SPAPR_TCE_v2_IOMMU: return 1; case VFIO_EEH: return eeh_enabled(); default: return 0;
}
}
/* * Sanity check to prevent one userspace from manipulating * another userspace mm.
*/
BUG_ON(!container); if (container->mm && container->mm != current->mm) return -EPERM;
if (copy_from_user(¶m, (void __user *)arg, minsz)) return -EFAULT;
if (param.argsz < minsz) return -EINVAL;
if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL;
ret = tce_iommu_create_default_window(container); if (ret) return ret;
num = tce_iommu_find_table(container, param.iova, &tbl); if (num < 0) return -ENXIO;
if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
(param.vaddr & ~IOMMU_PAGE_MASK(tbl))) return -EINVAL;
/* iova is checked by the IOMMU API */ if (param.flags & VFIO_DMA_MAP_FLAG_READ) { if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
direction = DMA_BIDIRECTIONAL; else
direction = DMA_TO_DEVICE;
} else { if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
direction = DMA_FROM_DEVICE; else return -EINVAL;
}
ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr); if (ret) return ret;
if (container->v2)
ret = tce_iommu_build_v2(container, tbl,
param.iova >> tbl->it_page_shift,
param.vaddr,
param.size >> tbl->it_page_shift,
direction); else
ret = tce_iommu_build(container, tbl,
param.iova >> tbl->it_page_shift,
param.vaddr,
param.size >> tbl->it_page_shift,
direction);
iommu_flush_tce(tbl);
return ret;
} case VFIO_IOMMU_UNMAP_DMA: { struct vfio_iommu_type1_dma_unmap param; struct iommu_table *tbl = NULL; long num;
if (copy_from_user(&create, (void __user *)arg, minsz)) return -EFAULT;
if (create.argsz < minsz) return -EINVAL;
if (create.flags) return -EINVAL;
mutex_lock(&container->lock);
ret = tce_iommu_create_default_window(container); if (!ret)
ret = tce_iommu_create_window(container,
create.page_shift,
create.window_size, create.levels,
&create.start_addr);
mutex_unlock(&container->lock);
if (!ret && copy_to_user((void __user *)arg, &create, minsz))
ret = -EFAULT;
return ret;
} case VFIO_IOMMU_SPAPR_TCE_REMOVE: { struct vfio_iommu_spapr_tce_remove remove;
if (!container->v2) break;
ret = tce_iommu_mm_set(container); if (ret) return ret;
if (!tce_groups_attached(container)) return -ENXIO;
/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
iommu_group_id(iommu_group), iommu_group); */
table_group = iommu_group_get_iommudata(iommu_group); if (!table_group) {
ret = -ENODEV; goto unlock_exit;
}
/* v2 requires full support of dynamic DMA windows */ if (container->v2 && table_group->max_dynamic_windows_supported == 0) {
ret = -EINVAL; goto unlock_exit;
}
/* v1 reuses TCE tables and does not share them among PEs */ if (!container->v2 && tce_groups_attached(container)) {
ret = -EBUSY; goto unlock_exit;
}
/* * Check if new group has the same iommu_table_group_ops * (i.e. compatible)
*/
list_for_each_entry(tcegrp, &container->group_list, next) { struct iommu_table_group *table_group_tmp;
if (tcegrp->grp == iommu_group) {
pr_warn("tce_vfio: Group %d is already attached\n",
iommu_group_id(iommu_group));
ret = -EBUSY; goto unlock_exit;
}
table_group_tmp = iommu_group_get_iommudata(tcegrp->grp); if (table_group_tmp->ops->create_table !=
table_group->ops->create_table) {
pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
iommu_group_id(iommu_group),
iommu_group_id(tcegrp->grp));
ret = -EPERM; goto unlock_exit;
}
}
tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL); if (!tcegrp) {
ret = -ENOMEM; goto unlock_exit;
}
ret = tce_iommu_take_ownership(container, table_group); if (!tce_groups_attached(container) && !container->tables[0])
container->def_window_pending = true;
if (!ret) {
tcegrp->grp = iommu_group;
list_add(&tcegrp->next, &container->group_list);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.