/* * We precalculate the hash to avoid doing it on every allocation. * * The hash is important to spread CPUs across all the pools. For example, * on a POWER7 with 4 way SMT we want interrupts on the primary threads and * with 4 pools all primary threads would map to the same pool.
*/ staticint __init setup_iommu_pool_hash(void)
{ unsignedint i;
if (action == BUS_NOTIFY_ADD_DEVICE) { if (device_create_file(dev, &dev_attr_fail_iommu))
pr_warn("Unable to create IOMMU fault injection sysfs " "entries\n");
} elseif (action == BUS_NOTIFY_DEL_DEVICE) {
device_remove_file(dev, &dev_attr_fail_iommu);
}
return 0;
}
/* * PCI and VIO buses need separate notifier_block structs, since they're linked * list nodes. Sharing a notifier_block would mean that any notifiers later * registered for PCI buses would also get called by VIO buses and vice versa.
*/ staticstruct notifier_block fail_iommu_pci_bus_notifier = {
.notifier_call = fail_iommu_bus_notify
};
return 0;
} /* * Must execute after PCI and VIO subsystem have initialised but before * devices are probed.
*/
arch_initcall(fail_iommu_setup); #else staticinlinebool should_fail_iommu(struct device *dev)
{ returnfalse;
} #endif
/* This allocator was derived from x86_64's bit string search */
/* Sanity check */ if (unlikely(npages == 0)) { if (printk_ratelimit())
WARN_ON(1); return DMA_MAPPING_ERROR;
}
if (should_fail_iommu(dev)) return DMA_MAPPING_ERROR;
/* * We don't need to disable preemption here because any CPU can * safely use any IOMMU pool.
*/
pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
if (largealloc)
pool = &(tbl->large_pool); else
pool = &(tbl->pools[pool_nr]);
/* The case below can happen if we have a small segment appended * to a large, or when the previous alloc was at the very end of * the available space. If so, go back to the initial start.
*/ if (start >= limit)
start = pool->start;
if (limit + tbl->it_offset > mask) {
limit = mask - tbl->it_offset + 1; /* If we're constrained on address range, first try * at the masked hint to avoid O(n) search complexity, * but on second pass, start at 0 in pool 0.
*/ if ((start & mask) >= limit || pass > 0) {
spin_unlock(&(pool->lock));
pool = &(tbl->pools[0]);
spin_lock(&(pool->lock));
start = pool->start;
} else {
start &= mask;
}
}
n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift),
align_mask); if (n == -1) { if (likely(pass == 0)) { /* First try the pool from the start */
pool->hint = pool->start;
pass++; goto again;
} elseif (pass <= tbl->nr_pools) { /* Now try scanning all the other pools */
spin_unlock(&(pool->lock));
pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
pool = &tbl->pools[pool_nr];
spin_lock(&(pool->lock));
pool->hint = pool->start;
pass++; goto again;
} else { /* Give up */
spin_unlock_irqrestore(&(pool->lock), flags); return DMA_MAPPING_ERROR;
}
}
end = n + npages;
/* Bump the hint to a new block for small allocs. */ if (largealloc) { /* Don't bump to new block to avoid fragmentation */
pool->hint = end;
} else { /* Overflow will be taken care of at the next allocation */
pool->hint = (end + tbl->it_blocksize - 1) &
~(tbl->it_blocksize - 1);
}
/* Update handle for SG allocations */ if (handle)
*handle = end;
if (unlikely(entry == DMA_MAPPING_ERROR)) return DMA_MAPPING_ERROR;
entry += tbl->it_offset; /* Offset into real TCE table */
ret = entry << tbl->it_page_shift; /* Set the return dma address */
/* Put the TCEs in the HW table */
build_fail = tbl->it_ops->set(tbl, entry, npages,
(unsignedlong)page &
IOMMU_PAGE_MASK(tbl), direction, attrs);
/* tbl->it_ops->set() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return * DMA_MAPPING_ERROR. For all other errors the functionality is * not altered.
*/ if (unlikely(build_fail)) {
__iommu_free(tbl, ret, npages); return DMA_MAPPING_ERROR;
}
/* Flush/invalidate TLB caches if necessary */ if (tbl->it_ops->flush)
tbl->it_ops->flush(tbl);
/* Make sure updates are seen by hardware */
mb();
/* The large pool is the last pool at the top of the table */ if (entry >= largepool_start) {
p = &tbl->large_pool;
} else { unsignedint pool_nr = entry / tbl->poolsize;
BUG_ON(pool_nr > tbl->nr_pools);
p = &tbl->pools[pool_nr];
}
/* Make sure TLB cache is flushed if the HW needs it. We do * not do an mb() here on purpose, it is not needed on any of * the current platforms.
*/ if (tbl->it_ops->flush)
tbl->it_ops->flush(tbl);
}
/* If we are in an open segment, try merging */ if (segstart != s) {
DBG(" - trying merge...\n"); /* We cannot merge if: * - allocated dma_addr isn't contiguous to previous allocation
*/ if (novmerge || (dma_addr != dma_next) ||
(outs->dma_length + s->length > max_seg_size)) { /* Can't merge: create a new segment */
segstart = s;
outcount++;
outs = sg_next(outs);
DBG(" can't merge, new segment.\n");
} else {
outs->dma_length += s->length;
DBG(" merged, new len: %ux\n", outs->dma_length);
}
}
if (segstart == s) { /* This is a new segment, fill entries */
DBG(" - filling new segment.\n");
outs->dma_address = dma_addr;
outs->dma_length = slen;
}
/* Calculate next page pointer for contiguous check */
dma_next = dma_addr + slen;
DBG(" - dma next is: %lx\n", dma_next);
}
/* Flush/invalidate TLB caches if necessary */ if (tbl->it_ops->flush)
tbl->it_ops->flush(tbl);
DBG("mapped %d elements:\n", outcount);
/* For the sake of ppc_iommu_unmap_sg, we clear out the length in the * next entry of the sglist if we didn't fill the list completely
*/ if (outcount < incount) {
outs = sg_next(outs);
outs->dma_length = 0;
}
/* Make sure updates are seen by hardware */
mb();
/* Flush/invalidate TLBs if necessary. As for iommu_free(), we * do not do an mb() here, the affected platforms do not need it * when freeing.
*/ if (tbl->it_ops->flush)
tbl->it_ops->flush(tbl);
}
void iommu_table_clear(struct iommu_table *tbl)
{
	unsigned long index, tceval, tcecount = 0;

	/*
	 * In case of firmware assisted dump the system goes through a clean
	 * reboot process at the time of system crash. Hence it's safe to
	 * clear the TCE entries if firmware assisted dump is active.
	 */
	if (!is_kdump_kernel() || is_fadump_active()) {
		/* Clear the table in case firmware left allocations in it */
		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
		return;
	}

	/*
	 * Booting a kdump kernel: preserve the existing mappings left by the
	 * first kernel by marking their entries used in the allocation bitmap.
	 */
	for (index = 0; index < tbl->it_size; index++) {
		tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
		/*
		 * Freed TCE entries contain 0x7fffffffffffffff on JS20
		 */
		if (tceval && (tceval != 0x7fffffffffffffffUL)) {
			__set_bit(index, tbl->it_map);
			tcecount++;
		}
	}

	if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
		/* Not enough free entries left for the kdump kernel to make
		 * its own mappings: forcibly release the topmost ones. */
		printk(KERN_WARNING
		       "TCE table is full; freeing %d entries for the kdump boot\n",
		       KDUMP_MIN_TCE_ENTRIES);
		for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
		     index < tbl->it_size; index++)
			__clear_bit(index, tbl->it_map);
	}
}
WARN_ON_ONCE(res_end < res_start); /* * Reserve page 0 so it will not be used for any mappings. * This avoids buggy drivers that consider page 0 to be invalid * to crash the machine or even lose data.
*/ if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
if (res_start < tbl->it_offset)
res_start = tbl->it_offset;
/* Check if res_start..res_end is a valid range in the table */ if (res_start >= res_end) {
tbl->it_reserved_start = tbl->it_offset;
tbl->it_reserved_end = tbl->it_offset; return;
}
for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
set_bit(i - tbl->it_offset, tbl->it_map);
}
/* * Build a iommu_table structure. This contains a bit map which * is used to manage allocation of the tce space.
*/ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, unsignedlong res_start, unsignedlong res_end)
{ unsignedlong sz; staticint welcomed = 0; unsignedint i; struct iommu_pool *p;
BUG_ON(!tbl->it_ops);
/* number of bytes needed for the bitmap */
sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsignedlong);
/* We only split the IOMMU table if we have 1GB or more of space */ if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
tbl->nr_pools = IOMMU_NR_POOLS; else
tbl->nr_pools = 1;
/* We reserve the top 1/4 of the table for large allocations */
tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
for (i = 0; i < tbl->nr_pools; i++) {
p = &tbl->pools[i];
spin_lock_init(&(p->lock));
p->start = tbl->poolsize * i;
p->hint = p->start;
p->end = p->start + tbl->poolsize;
}
/* Simple case with no reserved MMIO32 region */ if (!tbl->it_reserved_start && !tbl->it_reserved_end) return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size;
end = tbl->it_reserved_start - tbl->it_offset; if (find_next_bit(tbl->it_map, end, start) != end) returntrue;
/* Creates TCEs for a user provided buffer. The user buffer must be * contiguous real kernel storage (not vmalloc). The address passed here * comprises a page address and offset into that page. The dma_addr_t * returned will point to the same byte within the page as was passed in.
*/
dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, struct page *page, unsignedlong offset, size_t size, unsignedlong mask, enum dma_data_direction direction, unsignedlong attrs)
{
dma_addr_t dma_handle = DMA_MAPPING_ERROR; void *vaddr; unsignedlong uaddr; unsignedint npages, align;
/* Allocates a contiguous real buffer and creates mappings over it. * Returns the virtual address of the buffer and sets dma_handle * to the dma address (mapping) of the first page.
*/ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
size_t size, dma_addr_t *dma_handle, unsignedlong mask, gfp_t flag, int node)
{ void *ret = NULL;
dma_addr_t mapping; unsignedint order; unsignedint nio_pages, io_order; struct page *page; int tcesize = (1 << tbl->it_page_shift);
size = PAGE_ALIGN(size);
order = get_order(size);
/* * Client asked for way too much space. This is checked later * anyway. It is easier to debug here for the drivers than in * the tce tables.
*/ if (order >= IOMAP_MAX_ORDER) {
dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
size); return NULL;
}
/*
 * Hook a device up to its IOMMU group and trigger IOMMU probing for it.
 *
 * Returns 0 on success, -ENOENT if the device is not yet registered with
 * sysfs, -EBUSY if it already belongs to an IOMMU group, or whatever
 * iommu_probe_device() returns.
 */
int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
{
	/*
	 * The sysfs entries should be populated before binding the IOMMU
	 * group. If the sysfs entries aren't ready yet, simply bail.
	 */
	if (!device_is_registered(dev))
		return -ENOENT;

	if (device_iommu_mapped(dev)) {
		/* Already in a group: nothing to do here. */
		pr_debug("%s: Skipping device %s with iommu group %d\n",
			 __func__, dev_name(dev),
			 iommu_group_id(dev->iommu_group));
		return -EBUSY;
	}

	pr_debug("%s: Adding %s to iommu group %d\n",
		 __func__, dev_name(dev), iommu_group_id(table_group->group));

	/*
	 * This is still not adding devices via the IOMMU bus notifier because
	 * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls
	 * pcibios_scan_phb() first (and this guy adds devices and triggers
	 * the notifier) and only then it calls pci_bus_add_devices() which
	 * configures DMA for buses which also creates PEs and IOMMU groups.
	 */
	return iommu_probe_device(dev);
}
EXPORT_SYMBOL_GPL(iommu_add_device);
#ifdefined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * A simple iommu_ops to allow less cruft in generic VFIO code.
*/ staticint
spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, struct device *dev)
{ struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_table_group *table_group; struct iommu_group *grp;
/* At first attach the ownership is already set */ if (!domain) return 0;
grp = iommu_group_get(dev);
table_group = iommu_group_get_iommudata(grp); /* * The domain being set to PLATFORM from earlier * BLOCKED. The table_group ownership has to be released.
*/
table_group->ops->release_ownership(table_group, dev);
iommu_group_put(grp);
/* * This registers IOMMU devices of PHBs. This needs to happen * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and * before subsys_initcall(iommu_subsys_init).
*/ staticint __init spapr_tce_setup_phb_iommus_initcall(void)
{ struct pci_controller *hose;
Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.