/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 *
 * DOMAIN_MAX_PFN(gaw)  - __DOMAIN_MAX_PFN(gaw) clamped to (unsigned long)-1.
 * DOMAIN_MAX_ADDR(gaw) - __DOMAIN_MAX_PFN(gaw) shifted left by VTD_PAGE_SHIFT,
 *                        computed as a uint64_t.
 *
 * Each macro must start on its own line: a '#define' that follows a
 * backslash-continued line becomes part of the previous macro's body.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
/*
 * Set to 1 to panic the kernel if VT-d can't be enabled successfully
 * (used when the kernel is launched w/ TXT).
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;
/*
 * Return the Lower Context Table Pointer (LCTP) stored in @re.
 *
 * Bit 0 of the low word is the present bit: when it is clear, 0 is
 * returned; otherwise the low word masked with VTD_PAGE_MASK is returned.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	return (re->lo & 1) ? (re->lo & VTD_PAGE_MASK) : 0;
}
/* * Take a root_entry and return the Upper Context Table Pointer (UCTP) * if marked present.
*/ static phys_addr_t root_entry_uctp(struct root_entry *re)
{ if (!(re->hi & 1)) return 0;
/* * Looks up an IOMMU-probed device using its source ID. * * Returns the pointer to the device if there is a match. Otherwise, * returns NULL. * * Note that this helper doesn't guarantee that the device won't be * released by the iommu subsystem after being returned. The caller * should use its own synchronization mechanism to avoid the device * being released during its use if that is possibly the case.
*/ struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
{ struct device_domain_info *info = NULL; struct rb_node *node; unsignedlong flags;
spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); if (node)
info = rb_entry(node, struct device_domain_info, node);
spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
/*
 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 * the returned SAGAW.
 *
 * Returns the second-level mask when only second-level translation is
 * usable, the first-level mask when only first-level translation is
 * usable, and the intersection of both otherwise.
 */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
{
	unsigned long fl_sagaw, sl_sagaw;

	/*
	 * Both masks must be populated before any of the returns below;
	 * without these assignments the function returns indeterminate
	 * values.
	 *
	 * First level: bit 2 is always set, bit 3 is added when the
	 * capability register reports first-level 5-level paging support.
	 * Second level: taken directly from the capability register.
	 */
	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
	sl_sagaw = cap_sagaw(iommu->cap);

	/* Second level only. */
	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
		return sl_sagaw;

	/* First level only. */
	if (!ecap_slts(iommu->ecap))
		return fl_sagaw;

	return fl_sagaw & sl_sagaw;
}
/*
 * Pick the largest AGAW supported by @iommu that does not exceed the
 * AGAW corresponding to @max_gaw.
 *
 * The search starts at width_to_agaw(@max_gaw) and walks downwards until
 * a bit set in the SAGAW mask is found.
 *
 * Returns the AGAW found, or a negative value if no candidate bit is set.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
/*
 * Calculate max SAGAW for each iommu: the AGAW search is started from
 * MAX_AGAW_WIDTH.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	const int widest = MAX_AGAW_WIDTH;

	return __iommu_calculate_agaw(iommu, widest);
}
/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may differ across iommus, so start from a default agaw and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default one.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	const int preferred = DEFAULT_DOMAIN_ADDRESS_WIDTH;

	return __iommu_calculate_agaw(iommu, preferred);
}
/* * Except that the caller requested to allocate a new entry, * returning a copied context entry makes no sense.
*/ if (!alloc && context_copied(iommu, bus, devfn)) return NULL;
entry = &root->lo; if (sm_supported(iommu)) { if (devfn >= 0x80) {
devfn -= 0x80;
entry = &root->hi;
}
devfn *= 2;
} if (*entry & 1)
context = phys_to_virt(*entry & VTD_PAGE_MASK); else { unsignedlong phy_addr; if (!alloc) return NULL;
context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
SZ_4K); if (!context) return NULL;
/* We know that this device on this chipset has its own IOMMU. * If we find it under a different IOMMU, then the BIOS is lying * to us. Hope that the IOMMU for this device is actually * disabled, and it needs no translation...
*/
rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); if (rc) { /* "can't" happen */
dev_info(&pdev->dev, "failed to run vt-d quirk\n"); returnfalse;
}
vtbar &= 0xffff0000;
/* we know that this iommu should be at offset 0xa000 from vtbar */
drhd = dmar_find_matched_drhd_unit(pdev); if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); returntrue;
}
/* VFs aren't listed in scope tables; we need to look up
* the PF instead to find the IOMMU. */
pf_pdev = pci_physfn(pdev);
dev = &pf_pdev->dev;
segment = pci_domain_nr(pdev->bus);
} elseif (has_acpi_companion(dev))
dev = &ACPI_COMPANION(dev)->dev;
for_each_active_dev_scope(drhd->devices,
drhd->devices_cnt, i, tmp) { if (tmp == dev) { /* For a VF use its original BDF# not that of the PF * which we used for the IOMMU lookup. Strictly speaking * we could do this for all PCI devices; we only need to
* get the BDF# from the scope table for ACPI matches. */ if (pdev && pdev->is_virtfn) goto got_pdev;
/* get the pointer to the pasid table entry */
entries = get_pasid_table_from_pde(pde); if (!entries) {
pr_info("pasid table is not present\n"); return;
}
index = pasid & PASID_PTE_MASK;
pte = &entries[index]; for (i = 0; i < ARRAY_SIZE(pte->val); i++)
pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
if (!pasid_pte_is_present(pte)) {
pr_info("scalable mode page table is not present\n"); return;
}
tmp = 0ULL; if (!try_cmpxchg64(&pte->val, &tmp, pteval)) /* Someone else set it while we were thinking; use theirs. */
iommu_free_pages(tmp_page); else
domain_flush_cache(domain, pte, sizeof(*pte));
} if (level == 1) break;
/* return address's pte at specific level */ staticstruct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, unsignedlong pfn, int level, int *large_page)
{ struct dma_pte *parent, *pte; int total = agaw_to_level(domain->agaw); int offset;
/* * Free the page table if we're below the level we want to * retain and the range covers the entire table.
*/ if (level < retain_level && !(start_pfn > level_pfn ||
last_pfn < level_pfn + level_size(level) - 1)) {
dma_clear_pte(pte);
domain_flush_cache(domain, pte, sizeof(*pte));
iommu_free_pages(level_pte);
}
next:
pfn += level_size(level);
} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/* * clear last level (leaf) ptes and free page table pages below the * level we wish to keep intact.
*/ staticvoid dma_pte_free_pagetable(struct dmar_domain *domain, unsignedlong start_pfn, unsignedlong last_pfn, int retain_level)
{
dma_pte_clear_range(domain, start_pfn, last_pfn);
/* We don't need lock here; nobody else touches the iova range */
dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
domain->pgd, 0, start_pfn, last_pfn);
/*
 * When a page at a given level is being unlinked from its parent, we don't
 * need to *modify* it at all. All we need to do is make a list of all the
 * pages which can be freed just as soon as we've flushed the IOTLB and we
 * know the hardware page-walk will no longer touch them.
 *
 * @domain:     domain the page table belongs to (only passed on to the
 *              recursive calls)
 * @level:      level of the table that @parent_pte points to
 * @parent_pte: the *parent* PTE, pointing to the page that is to be freed
 * @freelist:   list that collects the pages to free
 */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *parent_pte,
				    struct iommu_pages_list *freelist)
{
	struct dma_pte *table = phys_to_virt(dma_pte_addr(parent_pte));
	struct dma_pte *entry = table;

	iommu_pages_list_add(freelist, table);

	/* At level 1 the entries are not walked any further. */
	if (level == 1)
		return;

	/* Collect every present, non-superpage child table of this page. */
	do {
		if (dma_pte_present(entry) && !dma_pte_superpage(entry))
			dma_pte_list_pagetables(domain, level - 1, entry,
						freelist);
	} while (!first_pte_in_page(++entry));
}
do { unsignedlong level_pfn = pfn & level_mask(level);
if (!dma_pte_present(pte)) goto next;
/* If range covers entire pagetable, free it */ if (start_pfn <= level_pfn &&
last_pfn >= level_pfn + level_size(level) - 1) { /* These subordinate page tables are going away entirely. Don't
bother to clear them; we're just going to *free* them. */ if (level > 1 && !dma_pte_superpage(pte))
dma_pte_list_pagetables(domain, level - 1, pte, freelist);
dma_clear_pte(pte); if (!first_pte)
first_pte = pte;
last_pte = pte;
} elseif (level > 1) { /* Recurse down into a level that isn't *entirely* obsolete */
dma_pte_clear_level(domain, level - 1,
phys_to_virt(dma_pte_addr(pte)),
level_pfn, start_pfn, last_pfn,
freelist);
}
next:
pfn = level_pfn + level_size(level);
} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
if (first_pte)
domain_flush_cache(domain, first_pte,
(void *)++last_pte - (void *)first_pte);
}
/* We can't just free the pages because the IOMMU may still be walking the page tables, and may have cached the intermediate levels. The
pages can only be freed after the IOTLB flush has been done. */ staticvoid domain_unmap(struct dmar_domain *domain, unsignedlong start_pfn, unsignedlong last_pfn, struct iommu_pages_list *freelist)
{ if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
WARN_ON(start_pfn > last_pfn)) return;
/* we don't need lock here; nobody else touches the iova range */
dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
domain->pgd, 0, start_pfn, last_pfn, freelist);
/* return value determine if we need a write buffer flush */ staticvoid __iommu_flush_context(struct intel_iommu *iommu,
u16 did, u16 source_id, u8 function_mask,
u64 type)
{
u64 val = 0; unsignedlong flag;
switch (type) { case DMA_CCMD_GLOBAL_INVL:
val = DMA_CCMD_GLOBAL_INVL; break; case DMA_CCMD_DOMAIN_INVL:
val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); break; case DMA_CCMD_DEVICE_INVL:
val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); break; default:
pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
iommu->name, type); return;
}
val |= DMA_CCMD_ICC;
switch (type) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */
val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; break; case DMA_TLB_DSI_FLUSH:
val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); break; case DMA_TLB_PSI_FLUSH:
val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); /* IH bit is passed in as part of address */
val_iva = size_order | addr; break; default:
pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
iommu->name, type); return;
}
if (cap_write_drain(iommu->cap))
val |= DMA_TLB_WRITE_DRAIN;
raw_spin_lock_irqsave(&iommu->register_lock, flag); /* Note: Only uses first TLB reg currently */ if (val_iva)
dmar_writeq(iommu->reg + tlb_offset, val_iva);
dmar_writeq(iommu->reg + tlb_offset + 8, val);
/* Make sure hardware complete it */
IOMMU_WAIT_OP(iommu, tlb_offset + 8,
dmar_readq, (!(val & DMA_TLB_IVT)), val);
/* * The extra devTLB flush quirk impacts those QAT devices with PCI device * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() * check because it applies only to the built-in QAT devices and it doesn't * grant additional privileges.
*/ #define BUGGY_QAT_DEVID_MASK 0x4940 staticbool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
{ if (pdev->vendor != PCI_VENDOR_ID_INTEL) returnfalse;
if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) returnfalse;
/*
 * Turn off DMA translation on @iommu if it is currently enabled.
 *
 * Warns and does nothing when domain IDs are still allocated on the IOMMU.
 */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	/*
	 * All iommu domains must have been detached from the devices,
	 * hence there should be no domain IDs in use.
	 */
	if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
		return;

	/* Nothing to do unless the translation-enable bit is set. */
	if (!(iommu->gcmd & DMA_GCMD_TE))
		return;

	iommu_disable_translation(iommu);
}
if (ecap_prs(iommu->ecap))
intel_iommu_finish_prq(iommu);
}
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 *
 * Returns false when scalable mode is not supported, the first-level
 * capability when exactly one of first/second level is available, and
 * true otherwise.
 */
static bool first_level_by_default(struct intel_iommu *iommu)
{
	/* Only SL is available in legacy mode */
	if (!sm_supported(iommu))
		return false;

	/* Only one level (either FL or SL) is available, just use it */
	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
		return ecap_flts(iommu->ecap);

	return true;
}
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{ struct iommu_domain_info *info, *curr; int num, ret = -ENOSPC;
if (domain->domain.type == IOMMU_DOMAIN_SVA) return 0;
info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM;
if (domain->domain.type == IOMMU_DOMAIN_SVA) return;
guard(mutex)(&iommu->did_lock);
info = xa_load(&domain->iommu_array, iommu->seq_id); if (--info->refcnt == 0) {
ida_free(&iommu->domain_ida, info->did);
xa_erase(&domain->iommu_array, iommu->seq_id);
kfree(info);
}
}
/* * For kdump cases, old valid entries may be cached due to the * in-flight DMA and copied pgtable, but there is no unmapping * behaviour for them, thus we need an explicit cache flush for * the newly-mapped device. For kdump, at this point, the device * is supposed to finish reset at its driver probe stage, so no * in-flight DMA will exist, and we don't need to worry anymore * hereafter.
*/ staticvoid copied_context_tear_down(struct intel_iommu *iommu, struct context_entry *context,
u8 bus, u8 devfn)
{
u16 did_old;
/*
 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If the
 * hardware _does_ cache non-present entries, then it does so in the
 * special domain #0, which we have to flush:
 *
 * @iommu: IOMMU whose caches are flushed
 * @did:   domain ID used for the domain-selective IOTLB flush
 * @bus:   bus number of the device
 * @devfn: device/function number of the device
 */
static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
					u8 bus, u8 devfn)
{
	if (!cap_caching_mode(iommu->cap)) {
		iommu_flush_write_buffer(iommu);
		return;
	}

	/* Caching mode: device-selective context flush on domain #0 ... */
	iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn),
				   DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);
	/* ... followed by a domain-selective IOTLB flush for @did. */
	iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
}
if (!dev_is_pci(dev)) return domain_context_mapping_one(domain, iommu, bus, devfn);
ret = pci_for_each_dma_alias(to_pci_dev(dev),
domain_context_mapping_cb, domain); if (ret) return ret;
iommu_enable_pci_ats(info);
return 0;
}
/* Return largest possible superpage level for a given mapping */ staticint hardware_largepage_caps(struct dmar_domain *domain, unsignedlong iov_pfn, unsignedlong phy_pfn, unsignedlong pages)
{ int support, level = 1; unsignedlong pfnmerge;
support = domain->iommu_superpage;
/* To use a large page, the virtual *and* physical addresses must be aligned to 2MiB/1GiB/etc. Lower bits set in either of them will mean we have to use smaller pages. So just
merge them and check both at once. */
pfnmerge = iov_pfn | phy_pfn;
/* * Ensure that old small page tables are removed to make room for superpage(s). * We're going to add new large pages, so make sure we don't remove their parent * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
*/ staticvoid switch_to_super_page(struct dmar_domain *domain, unsignedlong start_pfn, unsignedlong end_pfn, int level)
{ unsignedlong lvl_pages = lvl_to_nr_pages(level); struct dma_pte *pte = NULL;
if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
!IS_ALIGNED(end_pfn + 1, lvl_pages))) return;
while (start_pfn <= end_pfn) { if (!pte)
pte = pfn_to_dma_pte(domain, start_pfn, &level,
GFP_ATOMIC);
if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) return -EINVAL;
if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL;
if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); return -EINVAL;
}
/* If the next PTE would be the first in a new page, then we * need to flush the cache on the entries we've just written. * And then we'll need to recalculate 'pte', so clear it and * let it get set again in the if (!pte) block above. * * If we're done (!nr_pages) we need to flush the cache too. * * Also if we've been setting superpages, we may need to * recalculate 'pte' and switch back to smaller pages for the * end of the mapping, if the trailing size is not enough to * use another superpage (i.e. nr_pages < lvl_pages).
*/
pte++; if (!nr_pages || first_pte_in_page(pte) ||
(largepage_lvl > 1 && nr_pages < lvl_pages)) {
domain_flush_cache(domain, first_pte,
(void *)pte - (void *)first_pte);
pte = NULL;
}
}
/** * device_rmrr_is_relaxable - Test whether the RMRR of this device * is relaxable (ie. is allowed to be not enforced under some conditions) * @dev: device handle * * We assume that PCI USB devices with RMRRs have them largely * for historical reasons and that the RMRR space is not actively used post * boot. This exclusion may change if vendors begin to abuse it. * * The same exception is made for graphics devices, with the requirement that * any use of the RMRR regions will be torn down before assigning the device * to a guest. * * Return: true if the RMRR is relaxable, false otherwise
*/ staticbool device_rmrr_is_relaxable(struct device *dev)
{ struct pci_dev *pdev;
/* * Hardware does not support the passthrough translation mode. * Always use a dynamic mapping domain.
*/ if (!ecap_pass_through(iommu->ecap)) return IOMMU_DOMAIN_DMA;
if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev);
if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) return IOMMU_DOMAIN_IDENTITY;
}
return 0;
}
/*
 * Set up the invalidation interface of @iommu and install the matching
 * flush callbacks: queued invalidation when it can be enabled, register
 * based invalidation otherwise.
 */
static void intel_iommu_init_qi(struct intel_iommu *iommu)
{
	/*
	 * Start from the sane iommu hardware state.
	 * If the queued invalidation is already initialized by us
	 * (for example, while enabling interrupt-remapping) then
	 * we got the things already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	if (!dmar_enable_qi(iommu)) {
		iommu->flush.flush_context = qi_flush_context;
		iommu->flush.flush_iotlb = qi_flush_iotlb;
		pr_info("%s: Using Queued invalidation\n", iommu->name);
		return;
	}

	/*
	 * Queued Invalidate not enabled, use Register Based Invalidate
	 */
	iommu->flush.flush_context = __iommu_flush_context;
	iommu->flush.flush_iotlb = __iommu_flush_iotlb;
	pr_info("%s: Using Register based invalidation\n",
		iommu->name);
}
for (devfn = 0; devfn < 256; devfn++) { /* First calculate the correct index */
idx = (ext ? devfn * 2 : devfn) % 256;
if (idx == 0) { /* First save what we may have and clean up */ if (new_ce) {
tbl[tbl_idx] = new_ce;
__iommu_flush_cache(iommu, new_ce,
VTD_PAGE_SIZE);
pos = 1;
}
if (old_ce)
memunmap(old_ce);
ret = 0; if (devfn < 0x80)
old_ce_phys = root_entry_lctp(&re); else
old_ce_phys = root_entry_uctp(&re);
if (!old_ce_phys) { if (ext && devfn == 0) { /* No LCTP, try UCTP */
devfn = 0x7f; continue;
} else { goto out;
}
}
ret = -ENOMEM;
old_ce = memremap(old_ce_phys, PAGE_SIZE,
MEMREMAP_WB); if (!old_ce) goto out;
new_ce = iommu_alloc_pages_node_sz(iommu->node,
GFP_KERNEL, SZ_4K); if (!new_ce) goto out_unmap;
ret = 0;
}
/* Now copy the context entry */
memcpy(&ce, old_ce + idx, sizeof(ce));
if (!context_present(&ce)) continue;
did = context_domain_id(&ce); if (did >= 0 && did < cap_ndoms(iommu->cap))
ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
/* * The RTT bit can only be changed when translation is disabled, * but disabling translation means to open a window for data * corruption. So bail out and don't copy anything if we would * have to change the bit.
*/ if (new_ext != ext) return -EINVAL;
iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); if (!iommu->copied_tables) return -ENOMEM;
old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; if (!old_rt_phys) return -EINVAL;
old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); if (!old_rt) return -ENOMEM;
/* This is too big for the stack - allocate it from slab */
ctxt_table_entries = ext ? 512 : 256;
ret = -ENOMEM;
ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); if (!ctxt_tbls) goto out_unmap;
for (bus = 0; bus < 256; bus++) {
ret = copy_context_table(iommu, &old_rt[bus],
ctxt_tbls, bus, ext); if (ret) {
pr_err("%s: Failed to copy context table for bus %d\n",
iommu->name, bus); continue;
}
}
spin_lock(&iommu->lock);
/* Context tables are copied, now write them to the root_entry table */ for (bus = 0; bus < 256; bus++) { int idx = ext ? bus * 2 : bus;
u64 val;
if (ctxt_tbls[idx]) {
val = virt_to_phys(ctxt_tbls[idx]) | 1;
iommu->root_entry[bus].lo = val;
}
for_each_iommu(iommu, drhd) { if (drhd->ignored) {
iommu_disable_translation(iommu); continue;
}
/* * Find the max pasid size of all IOMMU's in the system. * We need to ensure the system pasid table is no bigger * than the smallest supported.
*/ if (pasid_supported(iommu)) {
u32 temp = 2 << ecap_pss(iommu->ecap);
if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
iommu_disable_translation(iommu);
clear_translation_pre_enabled(iommu);
pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
iommu->name);
}
/* * TBD: * we could share the same root & context tables * among all IOMMU's. Need to Split it later.
*/
ret = iommu_alloc_root_entry(iommu); if (ret) goto free_iommu;
if (translation_pre_enabled(iommu)) {
pr_info("Translation already enabled - trying to copy translation structures\n");
ret = copy_translation_tables(iommu); if (ret) { /* * We found the IOMMU with translation * enabled - but failed to copy over the * old root-entry table. Try to proceed * by disabling translation now and * allocating a clean root-entry table. * This might cause DMAR faults, but * probably the dump will still succeed.
*/
pr_err("Failed to copy translation tables from previous kernel for %s\n",
iommu->name);
iommu_disable_translation(iommu);
clear_translation_pre_enabled(iommu);
} else {
pr_info("Copied translation tables from previous kernel for %s\n",
iommu->name);
}
}
intel_svm_check(iommu);
}
/* * Now that qi is enabled on all iommus, set the root entry and flush * caches. This is required on some Intel X58 chipsets, otherwise the * flush_context function will loop forever and the boot hangs.
*/
for_each_active_iommu(iommu, drhd) {
iommu_flush_write_buffer(iommu);
iommu_set_root_entry(iommu);
}
check_tylersburg_isoch();
/* * for each drhd * enable fault log * global invalidate context cache * global invalidate iotlb * enable translation
*/
for_each_iommu(iommu, drhd) { if (drhd->ignored) { /* * we always have to disable PMRs or DMA may fail on * this device
*/ if (force_on)
iommu_disable_protect_mem_regions(iommu); continue;
}
iommu_flush_write_buffer(iommu);
if (ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition.
*/
up_write(&dmar_global_lock);
ret = intel_iommu_enable_prq(iommu);
down_write(&dmar_global_lock); if (ret) goto free_iommu;
}
ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu;
}
for_each_drhd_unit(drhd) { if (!drhd->include_all) {
for_each_active_dev_scope(drhd->devices,
drhd->devices_cnt, i, dev) break; /* ignore DMAR unit if no devices exist */ if (i == drhd->devices_cnt)
drhd->ignored = 1;
}
}
for_each_active_drhd_unit(drhd) { if (drhd->include_all) continue;
for_each_active_dev_scope(drhd->devices,
drhd->devices_cnt, i, dev) if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) break; if (i < drhd->devices_cnt) continue;
/* This IOMMU has *only* gfx devices. Either bypass it or
set the gfx_mapped flag, as appropriate */
drhd->gfx_dedicated = 1; if (disable_igfx_iommu)
drhd->ignored = 1;
}
}
for_each_active_iommu(iommu, drhd) { if (iommu->qi) {
ret = dmar_reenable_qi(iommu); if (ret) return ret;
}
}
for_each_iommu(iommu, drhd) { if (drhd->ignored) { /* * we always have to disable PMRs or DMA may fail on * this device
*/ if (force_on)
iommu_disable_protect_mem_regions(iommu); continue;
}
if (init_iommu_hw()) { if (force_on)
panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); else
WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); return;
}
atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); if (!atsru) return -ENOMEM;
/* * If memory is allocated from slab by ACPI _DSM method, we need to * copy the memory content because the memory buffer will be freed * on return.
*/
atsru->hdr = (void *)(atsru + 1);
memcpy(atsru->hdr, hdr, hdr->length);
atsru->include_all = atsr->flags & 0x1; if (!atsru->include_all) {
atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
(void *)atsr + atsr->header.length,
&atsru->devices_cnt); if (atsru->devices_cnt && atsru->devices == NULL) {
kfree(atsru); return -ENOMEM;
}
}
/* * Disable translation if already enabled prior to OS handover.
*/ if (iommu->gcmd & DMA_GCMD_TE)
iommu_disable_translation(iommu);
ret = iommu_alloc_root_entry(iommu); if (ret) goto out;
intel_svm_check(iommu);
if (dmaru->ignored) { /* * we always have to disable PMRs or DMA may fail on this device
*/ if (force_on)
iommu_disable_protect_mem_regions(iommu); return 0;
}
dev = pci_physfn(dev);
satcu = dmar_find_matched_satc_unit(dev); if (satcu) /* * This device supports ATS as it is in SATC table. * When IOMMU is in legacy mode, enabling ATS is done * automatically by HW for the device that requires * ATS, hence OS should not enable this device ATS * to avoid duplicated TLB invalidation.
*/ return !(satcu->atc_required && !sm_supported(iommu));
for (bus = dev->bus; bus; bus = bus->parent) {
bridge = bus->self; /* If it's an integrated device, allow ATS */ if (!bridge) returntrue; /* Connected via non-PCIe: no ATS */ if (!pci_is_pcie(bridge) ||
pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) returnfalse; /* If we found the root port, look it up in the ATSR */ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) break;
}
/* * All other CPUs were brought down, hotplug interrupts were disabled, * no lock and RCU checking needed anymore
*/
list_for_each_entry(drhd, &dmar_drhd_units, list) {
iommu = drhd->iommu;
if (no_iommu || dmar_disabled)
pr_info("Intel-IOMMU force enabled due to platform opt in\n");
/* * If Intel-IOMMU is disabled by default, we will apply identity * map for all devices except those marked as being untrusted.
*/ if (dmar_disabled)
iommu_set_default_passthrough(false);
dmar_disabled = 0;
no_iommu = 0;
return 1;
}
staticint __init probe_acpi_namespace_devices(void)
{ struct dmar_drhd_unit *drhd; /* To avoid a -Wunused-but-set-variable warning. */ struct intel_iommu *iommu __maybe_unused; struct device *dev; int i, ret = 0;
up_read(&dmar_global_lock);
adev = to_acpi_device(dev);
mutex_lock(&adev->physical_node_lock);
list_for_each_entry(pn,
&adev->physical_node_list, node) {
ret = iommu_probe_device(pn->dev); if (ret) break;
}
mutex_unlock(&adev->physical_node_lock);
down_read(&dmar_global_lock);
if (ret) return ret;
}
}
return 0;
}
/*
 * Force the Intel IOMMU on when tboot is enabled.
 *
 * Returns 0 without touching anything when tboot is not enabled.
 * Otherwise clears both no_iommu and dmar_disabled (warning if either
 * was set) and returns 1.
 */
static __init int tboot_force_iommu(void)
{
	if (tboot_enabled()) {
		if (dmar_disabled || no_iommu) {
			pr_warn("Forcing Intel-IOMMU to enabled\n");
		}

		no_iommu = 0;
		dmar_disabled = 0;

		return 1;
	}

	return 0;
}
int __init intel_iommu_init(void)
{ int ret = -ENODEV; struct dmar_drhd_unit *drhd; struct intel_iommu *iommu;
/* * Intel IOMMU is required for a TXT/tboot launch or platform * opt in, so enforce that.
*/
force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
platform_optin_force_iommu();
down_write(&dmar_global_lock); if (dmar_table_init()) { if (force_on)
panic("tboot: Failed to initialize DMAR table\n"); goto out_free_dmar;
}
if (dmar_dev_scope_init() < 0) { if (force_on)
panic("tboot: Failed to initialize DMAR device scope\n"); goto out_free_dmar;
}
up_write(&dmar_global_lock);
/* * The bus notifier takes the dmar_global_lock, so lockdep will * complain later when we register it under the lock.
*/
dmar_register_bus_notifier();
down_write(&dmar_global_lock);
if (!no_iommu)
intel_iommu_debugfs_init();
if (no_iommu || dmar_disabled) { /* * We exit the function here to ensure IOMMU's remapping and * mempool aren't setup, which means that the IOMMU's PMRs * won't be disabled via the call to init_dmars(). So disable * it explicitly here. The PMRs were setup by tboot prior to * calling SENTER, but the kernel is expected to reset/tear * down the PMRs.
*/ if (intel_iommu_tboot_noforce) {
for_each_iommu(iommu, drhd)
iommu_disable_protect_mem_regions(iommu);
}
/* * Make sure the IOMMUs are switched off, even when we * boot into a kexec kernel and the previous kernel left * them enabled
*/
intel_disable_iommus(); goto out_free_dmar;
}
if (list_empty(&dmar_rmrr_units))
pr_info("No RMRR found\n");
if (list_empty(&dmar_atsr_units))
pr_info("No ATSR found\n");
if (list_empty(&dmar_satc_units))
pr_info("No SATC found\n");
init_no_remapping_devices();
ret = init_dmars(); if (ret) { if (force_on)
panic("tboot: Failed to initialize DMARs\n");
pr_err("Initialization failed\n"); goto out_free_dmar;
}
up_write(&dmar_global_lock);
init_iommu_pm_ops();
down_read(&dmar_global_lock);
for_each_active_iommu(iommu, drhd) { /* * The flush queue implementation does not perform * page-selective invalidations that are required for efficient * TLB flushes in virtual environments. The benefit of batching * is likely to be much lower than the overhead of synchronizing * the virtual and physical IOMMU page-tables.
*/ if (cap_caching_mode(iommu->cap) &&
!first_level_by_default(iommu)) {
pr_info_once("IOMMU batching disallowed due to virtualization\n");
iommu_set_dma_strict();
}
iommu_device_sysfs_add(&iommu->iommu, NULL,
intel_iommu_groups, "%s", iommu->name); /* * The iommu device probe is protected by the iommu_probe_device_lock. * Release the dmar_global_lock before entering the device probe path * to avoid unnecessary lock order splat.
*/
up_read(&dmar_global_lock);
iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
down_read(&dmar_global_lock);
iommu_pmu_register(iommu);
}
if (probe_acpi_namespace_devices())
pr_warn("ACPI name space devices didn't probe correctly\n");
/* Finally, we enable the DMA remapping hardware. */
for_each_iommu(iommu, drhd) { if (!drhd->ignored && !translation_pre_enabled(iommu))
iommu_enable_translation(iommu);
domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); return 0;
}
/* * NB - intel-iommu lacks any sort of reference counting for the users of * dependent devices. If multiple endpoints have intersecting dependent * devices, unbinding the driver from any one of them will possibly leave * the others unable to operate.
*/ staticvoid domain_context_clear(struct device_domain_info *info)
{ if (!dev_is_pci(info->dev)) {
domain_context_clear_one(info, info->bus, info->devfn); return;
}
/* * Clear the page table pointer in context or pasid table entries so that * all DMA requests without PASID from the device are blocked. If the page * table has been set, clean up the data structures.
*/ void device_block_translation(struct device *dev)
{ struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; unsignedlong flags;
/* Device in DMA blocking state. Nothing to do. */ if (!info->domain_attached) return;
if (info->domain)
cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu))
intel_pasid_tear_down_entry(iommu, dev,
IOMMU_NO_PASID, false); else
domain_context_clear(info);
}
/* Device now in DMA blocking state. */
info->domain_attached = false;
/* * IOVA aperture: First-level translation restricts the input-address * to a canonical address (i.e., address bits 63:N have the same value * as address bit [N-1], where N is 48-bits with 4-level paging and * 57-bits with 5-level paging). Hence, skip bit [N-1].
*/
domain->domain.geometry.force_aperture = true;
domain->domain.geometry.aperture_start = 0; if (first_stage)
domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); else
domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
/* always allocate the top pgd */
domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K); if (!domain->pgd) {
kfree(domain); return ERR_PTR(-ENOMEM);
}
domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP);
/* Only SL is available in legacy mode */ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP);
dmar_domain = paging_domain_alloc(dev, true); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain);
dmar_domain->domain.ops = &intel_fs_paging_domain_ops; /* * iotlb sync for map is only needed for legacy implementations that * explicitly require flushing internal write buffers to ensure memory * coherence.
*/ if (rwbf_required(iommu))
dmar_domain->iotlb_sync_map = true;
/* Prefer first stage if possible by default. */
domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags); if (domain != ERR_PTR(-EOPNOTSUPP)) return domain; return intel_iommu_domain_alloc_second_stage(dev, iommu, flags);
}
if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) return -EINVAL; if (dmar_domain->nested_parent && !nested_supported(iommu)) return -EINVAL;
/* Legacy mode always supports second stage */ if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) return -EINVAL;
/* Same page size support */ if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) return -EINVAL; if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) return -EINVAL;
/* iotlb sync on map requirement */ if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) &&
!dmar_domain->iotlb_sync_map) return -EINVAL;
return 0;
}
/*
 * Check whether a paging domain can be used by @dev behind its IOMMU.
 *
 * @domain: the paging domain to validate.
 * @dev:    the device whose IOMMU the domain is checked against.
 *
 * The stage-specific compatibility helper runs first, followed by the
 * checks shared by both paging types: snoop control, paging-structure
 * coherency and address width.
 *
 * Returns 0 when the domain is compatible and a negative error code
 * otherwise. When the final condition below holds, the result of
 * intel_pasid_setup_sm_context() is returned instead.
 */
int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	int ret = -EINVAL;
	int addr_width;

	/* A paging domain is expected to be either first or second stage. */
	if (intel_domain_is_fs_paging(dmar_domain))
		ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
	else if (intel_domain_is_ss_paging(dmar_domain))
		ret = paging_domain_compatible_second_stage(dmar_domain, iommu);
	else if (WARN_ON(true))
		ret = -EINVAL;
	if (ret)
		return ret;

	/*
	 * FIXME this is locked wrong, it needs to be under the
	 * dmar_domain->lock
	 */
	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
		return -EINVAL;

	/* The domain and this IOMMU must agree on paging-structure coherency. */
	if (dmar_domain->iommu_coherency !=
			iommu_paging_structure_coherency(iommu))
		return -EINVAL;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
		return -EINVAL;

	/*
	 * NOTE(review): presumably this re-creates the scalable-mode context
	 * for an entry flagged as copied (e.g. inherited from a previous
	 * kernel) - confirm against context_copied().
	 */
	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
	    context_copied(iommu, info->bus, info->devfn))
		return intel_pasid_setup_sm_context(dev);

	return 0;
}
staticint intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev)
{ int ret;
device_block_translation(dev);
ret = paging_domain_compatible(domain, dev); if (ret) return ret;
ret = iopf_for_domain_set(domain, dev); if (ret) return ret;
ret = dmar_domain_attach_device(to_dmar_domain(domain), dev); if (ret)
iopf_for_domain_remove(domain, dev);
/* check if minimum agaw is sufficient for mapped address */
end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; if (end < max_addr) {
pr_err("%s: iommu width (%d) is not " "sufficient for the mapped address (%llx)\n",
__func__, dmar_domain->gaw, max_addr); return -EFAULT;
}
dmar_domain->max_addr = max_addr;
} /* Round up size to next multiple of PAGE_SIZE, if it and
the low bits of hpa would take us onto the next page */
size = aligned_nrpages(hpa, size); return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
}
/* Cope with horrid API which requires us to unmap more than the
size argument if it happens to be a large-page mapping. */ if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
&level, GFP_ATOMIC))) return 0;
if (dmar_domain->max_addr == iova + size)
dmar_domain->max_addr = iova;
/* * We do not use page-selective IOTLB invalidation in flush queue, * so there is no need to track page and sync iotlb.
*/ if (!iommu_iotlb_gather_queued(gather))
iommu_iotlb_gather_add_page(domain, gather, iova, size);
guard(spinlock_irqsave)(&dmar_domain->lock); if (!domain_support_force_snooping(dmar_domain) ||
dmar_domain->has_mappings) returnfalse;
/* * Second level page table supports per-PTE snoop control. The * iommu_map() interface will handle this by setting SNP bit.
*/
dmar_domain->set_pte_snp = true;
dmar_domain->force_snooping = true; returntrue;
}
switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: case IOMMU_CAP_DEFERRED_FLUSH: returntrue; case IOMMU_CAP_PRE_BOOT_PROTECTION: return dmar_platform_optin(); case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: return ecap_sc_support(info->iommu->ecap); case IOMMU_CAP_DIRTY_TRACKING: return ssads_supported(info->iommu); default: returnfalse;
}
}
/* * For IOMMU that supports device IOTLB throttling * (DIT), we assign PFSID to the invalidation desc * of a VF such that IOMMU HW can gauge queue depth * at PF level. If DIT is not set, PFSID will be * treated as reserved, which should be set to 0.
*/ if (ecap_dit(iommu->ecap))
info->pfsid = pci_dev_id(pci_physfn(pdev));
info->ats_qdep = pci_ats_queue_depth(pdev);
} if (sm_supported(iommu)) { if (pasid_supported(iommu)) { int features = pci_pasid_features(pdev);
if (features >= 0)
info->pasid_supported = features | 1;
}
/* * The PCIe spec, in its wisdom, declares that the behaviour of the * device is undefined if you enable PASID support after ATS support. * So always enable PASID support on devices which have it, even if * we can't yet know if we're ever going to use it.
*/ if (info->pasid_supported &&
!pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
info->pasid_enabled = 1;
if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
iommu_enable_pci_ats(info); /* Assign a DEVTLB cache tag to the default domain. */ if (info->ats_enabled && info->domain) {
u16 did = domain_id_iommu(info->domain, iommu);
/* * Check that the device does not live on an external facing PCI port that is * marked as untrusted. Such devices should not be able to apply quirks and * thus not be able to bypass the IOMMU restrictions.
*/ staticbool risky_device(struct pci_dev *pdev)
{ if (pdev->untrusted) {
pci_info(pdev, "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
pdev->vendor, pdev->device);
pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); returntrue;
} returnfalse;
}
/* * Set dirty tracking for the device list of a domain. The caller must * hold the domain->lock when calling it.
*/ staticint device_set_dirty_tracking(struct list_head *devices, bool enable)
{ struct device_domain_info *info; int ret = 0;
list_for_each_entry(info, devices, link) {
ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
IOMMU_NO_PASID, enable); if (ret) break;
}
/* * IOMMUFD core calls into a dirty tracking disabled domain without an * IOVA bitmap set in order to clean dirty bits in all PTEs that might * have occurred when we stopped dirty tracking. This ensures that we * never inherit dirtied bits from a previous cycle.
*/ if (!dmar_domain->dirty_tracking && dirty->bitmap) return -EINVAL;
/* * In pass through mode, AW must be programmed to indicate the largest * AGAW value supported by hardware. And ASR is ignored by hardware.
*/
context_set_address_width(context, iommu->msagaw);
context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
context_set_fault_enable(context);
context_set_present(context); if (!ecap_coherent(iommu->ecap))
clflush_cache_range(context, sizeof(*context));
context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
spin_unlock(&iommu->lock);
/* * No PRI support with the global identity domain. No need to enable or * disable PRI in this path as the iommu has been put in the blocking * state.
*/ if (sm_supported(iommu))
ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); else
ret = device_setup_pass_through(dev);
/*
 * Quirk handler that forces the write-buffer flush (RWBF) capability on by
 * setting rwbf_quirk. Nothing is changed for devices that risky_device()
 * reports as untrusted.
 */
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
disable_igfx_iommu = 1;
} elseif (!disable_igfx_iommu) { /* we have to ensure the gfx device is idle before we flush */
pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
iommu_set_dma_strict();
}
}
/*
 * Register quirk_calpella_no_shadow_gtt() as a PCI header fixup for the
 * Intel devices with IDs 0x0040, 0x0062 and 0x006a.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
ver = (dev->device >> 8) & 0xff; if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
ver != 0x4e && ver != 0x8a && ver != 0x98 &&
ver != 0x9a && ver != 0xa7 && ver != 0x7d) return;
/*
 * On Tylersburg chipsets, some BIOSes have been known to enable the
 * ISOCH DMAR unit for the Azalia sound device, but not give it any
 * TLB entries, which causes it to deadlock. Check for that.  We do
 * this in a function called from init_dmars(), instead of in a PCI
 * quirk, because we don't want to print the obnoxious "BIOS broken"
 * message if VT-d is actually disabled.
 *
 * When the ISOCH unit has zero TLB entries, IDENTMAP_AZALIA is set in
 * iommu_identity_mapping. Any other non-recommended count only produces
 * a warning.
 */
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	/* Only the presence of the device mattered; drop the reference. */
	pci_dev_put(pdev);

	/*
	 * System Management Registers. Might be hidden, in which case
	 * we can't do the sanity check. But that's OK, because the
	 * known-broken BIOSes _don't_ actually hide it, so far.
	 */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	/* Bail out if the device is untrusted or the config read fails. */
	if (risky_device(pdev) ||
	    pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	/* vtisochctrl is unsigned, so print it with %u. */
	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %u\n",
		vtisochctrl);
}
/* * Here we deal with a device TLB defect where device may inadvertently issue ATS * invalidation completion before posted writes initiated with translated address * that utilized translations matching the invalidation address range, violating * the invalidation completion ordering. * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is * vulnerable to this defect. In other words, any dTLB invalidation initiated not * under the control of the trusted/privileged host device driver must use this * quirk. * Device TLBs are invalidated under the following six conditions: * 1. Device driver does DMA API unmap IOVA * 2. Device driver unbind a PASID from a process, sva_unbind_device() * 3. PASID is torn down, after PASID cache is flushed. e.g. process * exit_mmap() due to crash * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where * VM has to free pages that were unmapped * 5. Userspace driver unmaps a DMA buffer * 6. Cache invalidation in vSVA usage (upcoming) * * For #1 and #2, device drivers are responsible for stopping DMA traffic * before unmap/unbind. For #3, iommu driver gets mmu_notifier to * invalidate TLB the same way as normal user unmap which will use this quirk. * The dTLB invalidation after PASID cache flush does not need this quirk. * * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
*/ void quirk_extra_dev_tlb_flush(struct device_domain_info *info, unsignedlong address, unsignedlong mask,
u32 pasid, u16 qdep)
{
u16 sid;
/* * Function to submit a command to the enhanced command interface. The * valid enhanced command descriptions are defined in Table 47 of the * VT-d spec. The VT-d hardware implementation may support some but not * all commands, which can be determined by checking the Enhanced * Command Capability Register. * * Return values: * - 0: Command successful without any error; * - Negative: software error value; * - Nonzero positive: failure status code defined in Table 48.
*/ int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{ unsignedlong flags;
u64 res; int ret;
res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); if (res & DMA_ECMD_ECRSP_IP) {
ret = -EBUSY; goto err;
}
/* * Unconditionally write the operand B, because * - There is no side effect if an ecmd doesn't require an * operand B, but we set the register to some value. * - It's not invoked in any critical path. The extra MMIO * write doesn't bring any performance concerns.
*/
dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.