/* Setup queue base, control registers and default queue length */ #define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ struct riscv_iommu_queue *_q = q; \
_q->qid = RISCV_IOMMU_INTR_ ## name; \
_q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
} while (0)
/*
 * Note: offsets are the same for all queues.
 *
 * Q_HEAD/Q_TAIL compute a queue's head/tail register address from its base
 * register, reusing the command-queue register spacing (CQH/CQT vs CQB).
 * Q_ITEM wraps an index into the ring buffer range using the queue mask.
 * Q_IPSR yields the queue's bit position keyed by queue id — presumably
 * its bit in the IPSR interrupt status register (name suggests; confirm).
 */
#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
#define Q_ITEM(q, index) ((q)->mask & (index))
#define Q_IPSR(q) BIT((q)->qid)
/*
 * Discover queue ring buffer hardware configuration, allocate in-memory
 * ring buffer or use fixed I/O memory location, configure queue base register.
 * Must be called before hardware queue is enabled.
 *
 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
 * @entry_size - queue single element size in bytes.
 *
 * NOTE(review): this extraction appears garbled — the visible body mixes
 * queue_alloc logic with what looks like queue enable/disable code: `csr`
 * is used at the bottom but never declared or read here, `rb` and
 * `queue_size` are declared/computed but never used, and the function has
 * no visible return or closing brace. Restore from the upstream source
 * before modifying.
 */
static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
				   struct riscv_iommu_queue *queue,
				   size_t entry_size)
{
	unsigned int logsz;
	u64 qb, rb;

	/*
	 * Use WARL base register property to discover maximum allowed
	 * number of entries and optional fixed IO address for queue location.
	 */
	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
	qb = riscv_iommu_readq(iommu, queue->qbr);

	/*
	 * Calculate and verify hardware supported queue length, as reported
	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
	 * Update queue size based on hardware supported value.
	 */
	logsz = ilog2(queue->mask);
	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);

	/*
	 * Use WARL base register property to discover an optional fixed IO
	 * address for queue ring buffer location. Otherwise allocate contiguous
	 * system memory.
	 */
	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
		const size_t queue_size = entry_size << (logsz + 1);

		/* Empty queue before enabling it */
		if (queue->qid == RISCV_IOMMU_INTR_CQ)
			riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
		else
			riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);

		/*
		 * Enable queue with interrupts, clear any memory fault if any.
		 * Wait for the hardware to acknowledge request and activate queue
		 * processing.
		 * Note: All CSR bitfields are in the same offsets for all queues.
		 */
		riscv_iommu_writel(iommu, queue->qcr,
				   RISCV_IOMMU_QUEUE_ENABLE |
				   RISCV_IOMMU_QUEUE_INTR_ENABLE |
				   RISCV_IOMMU_QUEUE_MEM_FAULT);

		/* NOTE(review): `csr` is undeclared in the visible text. */
		if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
			dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
				queue->qid, csr);

		queue->iommu = NULL;
	}
/*
 * Returns number of available valid queue entries and the first item index.
 * Update shadow producer index if necessary.
 *
 * NOTE(review): truncated in this extraction — only the local-variable
 * setup is visible; the remainder of the function is missing.
 */
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
				     unsigned int *index)
{
	unsigned int head = atomic_read(&queue->head);
	unsigned int tail = atomic_read(&queue->tail);
	unsigned int last = Q_ITEM(queue, tail);
	int available = (int)(tail - head);
/*
 * Return actual consumer index based on hardware reported queue head index.
 *
 * NOTE(review): truncated in this extraction — only the declarations are
 * visible; the remainder of the function is missing.
 */
static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
{
	const unsigned int cons = atomic_read(&queue->head);
	const unsigned int last = Q_ITEM(queue, cons);
	unsigned int head;
/*
 * Enqueue an entry and wait to be processed if timeout_us > 0
 *
 * Error handling for IOMMU hardware not responding in reasonable time
 * will be added as separate patch series along with other RAS features.
 * For now, only report hardware failure and continue.
 *
 * NOTE(review): truncated in this extraction — the `err_busy` label that
 * the three `goto err_busy` statements target, the final return, and the
 * closing brace are not visible here.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
					   void *entry, size_t entry_size)
{
	unsigned int prod;
	unsigned int head;
	unsigned int tail;
	unsigned long flags;

	/* Do not preempt submission flow. */
	local_irq_save(flags);

	/* 1. Allocate some space in the queue */
	prod = atomic_inc_return(&queue->prod) - 1;
	head = atomic_read(&queue->head);

	/* 2. Wait for space availability. */
	if ((prod - head) > queue->mask) {
		if (readx_poll_timeout(atomic_read, &queue->head,
				       head, (prod - head) < queue->mask,
				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
	} else if ((prod - head) == queue->mask) {
		const unsigned int last = Q_ITEM(queue, head);

		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
					      !(head & ~queue->mask) && head != last,
					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
		atomic_add((head - last) & queue->mask, &queue->head);
	}

	/* 3. Store entry in the ring buffer */
	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);

	/* 4. Wait for all previous entries to be ready */
	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
		goto err_busy;

	/*
	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
	 *    completed and visible before signaling the tail doorbell to fetch
	 *    the next command. 'fence ow, ow'
	 */
	dma_wmb();
	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));

	/*
	 * 6. Make sure the doorbell write to the device has finished before updating
	 *    the shadow tail index in normal memory. 'fence o, w'
	 */
	mmiowb();
	atomic_inc(&queue->tail);

	/* 7. Complete submission and restore local interrupts */
	local_irq_restore(flags);
	/*
	 * NOTE(review): fragment — the enclosing function's signature and
	 * the declarations of ddtp, ddt, old, new, ptr, depth, ddi_bits,
	 * base_format and devid are not visible in this extraction. The
	 * final cast suggests this is the device-context lookup
	 * (riscv_iommu_get_dc); confirm against the upstream source.
	 */

	/* Make sure the mode is valid */
	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
		return NULL;

	/* Make sure device id is within range */
	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
	if (devid >= (1 << ddi_bits[depth]))
		return NULL;

	/* Get to the level of the non-leaf node that holds the device context */
	for (ddtp = iommu->ddt_root; depth-- > 0;) {
		const int split = ddi_bits[depth];
		/*
		 * Each non-leaf node is 64bits wide and on each level
		 * nodes are indexed by DDI[depth].
		 */
		ddtp += (devid >> split) & 0x1FF;

		/*
		 * Check if this node has been populated and if not
		 * allocate a new level and populate it.
		 */
		do {
			ddt = READ_ONCE(*(unsigned long *)ddtp);
			if (ddt & RISCV_IOMMU_DDTE_V) {
				ddtp = __va(ppn_to_phys(ddt));
				break;
			}

			ptr = riscv_iommu_get_pages(iommu, SZ_4K);
			if (!ptr)
				return NULL;

			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);

			if (old == ddt) {
				ddtp = (u64 *)ptr;
				break;
			}

			/* Race setting DDT detected, re-read and retry. */
			riscv_iommu_free_pages(iommu, ptr);
		} while (1);
	}

	/*
	 * Grab the node that matches DDI[depth], note that when using base
	 * format the device context is 4 * 64bits, and the extended format
	 * is 8 * 64bits, hence the (3 - base_format) below.
	 */
	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);

	return (struct riscv_iommu_dc *)ddtp;
}
/* * This is best effort IOMMU translation shutdown flow. * Disable IOMMU without waiting for hardware response.
*/ void riscv_iommu_disable(struct riscv_iommu_device *iommu)
{
riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
}
	/*
	 * NOTE(review): fragment of a larger function (device directory
	 * setup); the enclosing signature and the declarations of ddtp
	 * and mode are not visible in this extraction.
	 */
	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/*
	 * It is optional for the hardware to report a fixed address for device
	 * directory root page when DDT.MODE is OFF or BARE.
	 */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
		/* Use WARL to discover hardware fixed DDT PPN */
		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
			return -EBUSY;

		iommu->ddt_phys = ppn_to_phys(ddtp);
		if (iommu->ddt_phys)
			iommu->ddt_root = devm_ioremap(iommu->dev,
						       iommu->ddt_phys, PAGE_SIZE);
		if (iommu->ddt_root)
			memset(iommu->ddt_root, 0, PAGE_SIZE);
	}
	/*
	 * NOTE(review): fragment — this is the tail of a retry loop (the
	 * `continue` and closing `} while (1);` below belong to an
	 * enclosing do/while not visible here); rq_mode, rq_ddtp, mode,
	 * ddtp, ddtp_mode and dev are declared elsewhere.
	 */

	/* Hardware mandatory DDTP mode has not been accepted. */
	if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
		dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
			ddtp, rq_ddtp);
		return -EINVAL;
	}

	/*
	 * Mode field is WARL, an IOMMU may support a subset of
	 * directory table levels in which case if we tried to set
	 * an unsupported number of levels we'll readback either
	 * a valid xLVL or off/bare. If we got off/bare, try again
	 * with a smaller xLVL.
	 */
	if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
	    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
		dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
		rq_mode--;
		continue;
	}

	/*
	 * We tried all supported modes and IOMMU hardware failed to
	 * accept new settings, something went very wrong since off/bare
	 * and at least one xLVL must be supported.
	 */
	dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
		mode, ddtp_mode);
	return -EINVAL;
} while (1);
/* Private IOMMU data for managed devices, dev_iommu_priv_* */
struct riscv_iommu_info {
	struct riscv_iommu_domain *domain;
};
/*
 * Linkage between an iommu_domain and attached devices.
 *
 * Protection domain requiring IOATC and DevATC translation cache invalidations,
 * should be linked to attached devices using a riscv_iommu_bond structure.
 * Devices should be linked to the domain before first use and unlinked after
 * the translations from the referenced protection domain can no longer be used.
 * Blocking and identity domains are not tracked here, as the IOMMU hardware
 * does not cache negative and/or identity (BARE mode) translations, and DevATC
 * is disabled for those protection domains.
 *
 * The device pointer and IOMMU data remain stable in the bond struct after
 * _probe_device() where it's attached to the managed IOMMU, up to the
 * completion of the _release_device() call. The release of the bond structure
 * is synchronized with the device release.
 */
struct riscv_iommu_bond {
	struct list_head list;
	struct rcu_head rcu;
	struct device *dev;
};
	/*
	 * NOTE(review): fragment — enclosing function signature and the
	 * declarations of domain, dev, iommu, bond, found, count and cmd
	 * are not visible in this extraction.
	 */
	spin_lock(&domain->lock);
	list_for_each_entry(bond, &domain->bonds, list) {
		if (found && count)
			break;
		else if (bond->dev == dev)
			found = bond;
		else if (dev_to_iommu(bond->dev) == iommu)
			count++;
	}
	if (found)
		list_del_rcu(&found->list);
	spin_unlock(&domain->lock);
	/* kfree_rcu() is a no-op when found is NULL. */
	kfree_rcu(found, rcu);

	/*
	 * If this was the last bond between this domain and the IOMMU
	 * invalidate all cached entries for domain's PSCID.
	 */
	if (!count) {
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		riscv_iommu_cmd_send(iommu, &cmd);
/*
 * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)
	/*
	 * For each IOMMU linked with this protection domain (via bonds->dev),
	 * an IOTLB invalidation command will be submitted and executed.
	 *
	 * Possible race with domain attach flow is handled by sequencing
	 * bond creation - riscv_iommu_bond_link(), and device directory
	 * update - riscv_iommu_iodir_update().
	 *
	 * PTE Update / IOTLB Inval           Device attach & directory update
	 * --------------------------         --------------------------
	 * update page table entries          add dev to the bond list
	 * FENCE RW,RW                        FENCE RW,RW
	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
	 *   FENCE IOW,IOW                      FENCE IOW,IOW
	 *   IOTLB.INVAL                        IODIR.INVAL
	 *   IOFENCE.C
	 *
	 * If bond list is not updated with new device, directory context will
	 * be configured with already valid page table content. If an IOMMU is
	 * linked to the protection domain it will receive invalidation
	 * requests for updated page table entries.
	 *
	 * NOTE(review): fragment — the enclosing loop over bonds and the
	 * declarations of iommu/prev are not visible in this extraction.
	 */
	smp_mb();

	/*
	 * IOTLB invalidation request can be safely omitted if already sent
	 * to the IOMMU for the same PSCID, and with domain->bonds list
	 * arranged based on the device's IOMMU, it's sufficient to check
	 * last device the invalidation was sent to.
	 */
	if (iommu == prev)
		continue;
/*
 * Update IODIR for the device.
 *
 * During the execution of riscv_iommu_probe_device(), IODIR entries are
 * allocated for the device's identifiers. Device context invalidation
 * becomes necessary only if one of the updated entries was previously
 * marked as valid, given that invalid device context entries are not
 * cached by the IOMMU hardware.
 * In this implementation, updating a valid device context while the
 * device is not quiesced might be disruptive, potentially causing
 * interim translation faults.
 *
 * NOTE(review): this extraction appears truncated — the body of the
 * first loop (which presumably invalidates previously-valid contexts and
 * sets sync_required) is cut short, `cmd` and `sync_required` are never
 * written in the visible text, and the function has no visible end.
 */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
				     struct device *dev, u64 fsc, u64 ta)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_dc *dc;
	struct riscv_iommu_command cmd;
	bool sync_required = false;
	u64 tc;
	int i;

	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		if (!(tc & RISCV_IOMMU_DC_TC_V))
			continue;

	if (sync_required)
		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	/*
	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
	 */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		tc |= ta & RISCV_IOMMU_DC_TC_V;

		WRITE_ONCE(dc->fsc, fsc);
		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
		/* Update device context, write TC.V as the last step. */
		dma_wmb();
		WRITE_ONCE(dc->tc, tc);
	/*
	 * NOTE(review): fragment — the enclosing function's signature and
	 * the declarations of ptr, pte, i, domain and freelist are not
	 * visible in this extraction.
	 */

	/* Recursively free all sub page table pages */
	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = READ_ONCE(ptr[i]);
		/* Claim each live entry with cmpxchg before descending. */
		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
			riscv_iommu_pte_free(domain, pte, freelist);
	}

	if (freelist)
		iommu_pages_list_add(freelist, ptr);
	else
		iommu_free_pages(ptr);
}
	/*
	 * NOTE(review): fragment — the enclosing function's signature and
	 * the declarations of level, iova, pgsize, ptr, pte, old, addr,
	 * domain and gfp are not visible in this extraction. The walk
	 * descends the page table, allocating missing non-leaf levels.
	 */
	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		/*
		 * Note: returned entry might be a non-leaf if there was
		 * existing mapping with smaller granularity. Up to the caller
		 * to replace and invalidate.
		 */
		if (((size_t)1 << shift) == pgsize)
			return ptr;
pte_retry:
		pte = READ_ONCE(*ptr);
		/*
		 * This is very likely incorrect as we should not be adding
		 * new mapping with smaller granularity on top
		 * of existing 2M/1G mapping. Fail.
		 */
		if (_io_pte_present(pte) && _io_pte_leaf(pte))
			return NULL;
		/*
		 * Non-leaf entry is missing, allocate and try to add to the
		 * page table. This might race with other mappings, retry.
		 */
		if (_io_pte_none(pte)) {
			addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp,
							 SZ_4K);
			if (!addr)
				return NULL;

			old = pte;
			pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
			/* Lost the race installing the new level — retry. */
			if (cmpxchg_relaxed(ptr, old, pte) != old) {
				iommu_free_pages(addr);
				goto pte_retry;
			}
		}
		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
	} while (level-- > 0);
	/*
	 * NOTE(review): fragment — freelist, domain, va_mask and va_bits
	 * are declared elsewhere; this extraction splices pieces of (at
	 * least) two different functions together.
	 */
	if (!iommu_pages_list_empty(&freelist)) {
		/*
		 * In 1.0 spec version, the smallest scope we can use to
		 * invalidate all levels of page table (i.e. leaf and non-leaf)
		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
		 * This will be updated with hardware support for
		 * capability.NL (non-leaf) IOTINVAL command.
		 */
		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
		iommu_put_pages_list(&freelist);
	}

	/*
	 * Note: RISC-V Privilege spec mandates that virtual addresses
	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
	 * bits >= VA_BITS need to also be set or else we'll get a
	 * page fault. However the code that creates the mappings
	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
	 * for now, so we'll end up with invalid virtual addresses
	 * to map. As a workaround until we get this sorted out
	 * limit the available virtual addresses to VA_BITS - 1.
	 */
	va_mask = DMA_BIT_MASK(va_bits - 1);
	/*
	 * NOTE(review): fragment of the device probe path — the enclosing
	 * signature and the declarations of fwspec, iommu, info, dc, tc,
	 * i and dev are not visible in this extraction.
	 */
	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
		return ERR_PTR(-ENODEV);

	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	/*
	 * IOMMU hardware operating in fail-over BARE mode will provide
	 * identity translation for all connected devices anyway...
	 */
	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	/*
	 * Allocate and pre-configure device context entries in
	 * the device directory. Do not mark the context valid yet.
	 */
	tc = 0;
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
		tc |= RISCV_IOMMU_DC_TC_SADE;
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		if (!dc) {
			kfree(info);
			return ERR_PTR(-ENODEV);
		}
		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
			dev_warn(dev, "already attached to IOMMU device directory\n");
		WRITE_ONCE(dc->tc, tc);
	}
	/*
	 * NOTE(review): fragment of hardware sanity/init checks — the
	 * enclosing signature and the declaration of ddtp are not visible
	 * in this extraction, and the body is cut off after the
	 * irqs_count check.
	 */

	/*
	 * Make sure the IOMMU is switched off or in pass-through mode during
	 * regular boot flow and disable translation when we boot into a kexec
	 * kernel and the previous kernel left them enabled.
	 */
	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
	    RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
		if (!is_kdump_kernel())
			return -EBUSY;
		riscv_iommu_disable(iommu);
	}

	/* Configure accesses to in-memory data structures for CPU-native byte order. */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
			return -EINVAL;
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
		/* Re-read to verify the WARL toggle actually took effect. */
		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
			return -EINVAL;
	}

	/*
	 * Distribute interrupt vectors, always use first vector for CIV.
	 * At least one interrupt is required. Read back and verify.
	 */
	if (!iommu->irqs_count)
		return -EINVAL;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.