/* Setup queue base, control registers and default queue length */ #define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ struct riscv_iommu_queue *_q = q; \
_q->qid = RISCV_IOMMU_INTR_ ## name; \
_q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
} while (0)
/*
 * Note: offsets are the same for all queues.
 *
 * Q_HEAD/Q_TAIL compute a queue's head/tail register address from its base
 * register, reusing the command-queue register spacing (CQH/CQT vs CQB).
 * Q_ITEM wraps an index into the ring buffer range using the queue mask.
 * Q_IPSR yields the queue's bit position keyed by queue id — presumably
 * its bit in the IPSR interrupt status register (name suggests; confirm).
 */
#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
#define Q_ITEM(q, index) ((q)->mask & (index))
#define Q_IPSR(q) BIT((q)->qid)
/*
 * Discover queue ring buffer hardware configuration, allocate in-memory
 * ring buffer or use fixed I/O memory location, configure queue base register.
 * Must be called before hardware queue is enabled.
 *
 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
 * @entry_size - queue single element size in bytes.
 *
 * NOTE(review): this extraction appears garbled — the visible body mixes
 * queue_alloc logic with what looks like queue enable/disable code: `csr`
 * is used at the bottom but never declared or read here, `rb` and
 * `queue_size` are declared/computed but never used, and the function has
 * no visible return or closing brace. Restore from the upstream source
 * before modifying.
 */
static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
				   struct riscv_iommu_queue *queue,
				   size_t entry_size)
{
	unsigned int logsz;
	u64 qb, rb;

	/*
	 * Use WARL base register property to discover maximum allowed
	 * number of entries and optional fixed IO address for queue location.
	 */
	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
	qb = riscv_iommu_readq(iommu, queue->qbr);

	/*
	 * Calculate and verify hardware supported queue length, as reported
	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
	 * Update queue size based on hardware supported value.
	 */
	logsz = ilog2(queue->mask);
	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);

	/*
	 * Use WARL base register property to discover an optional fixed IO
	 * address for queue ring buffer location. Otherwise allocate contiguous
	 * system memory.
	 */
	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
		const size_t queue_size = entry_size << (logsz + 1);

		/* Empty queue before enabling it */
		if (queue->qid == RISCV_IOMMU_INTR_CQ)
			riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
		else
			riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);

		/*
		 * Enable queue with interrupts, clear any memory fault if any.
		 * Wait for the hardware to acknowledge request and activate queue
		 * processing.
		 * Note: All CSR bitfields are in the same offsets for all queues.
		 */
		riscv_iommu_writel(iommu, queue->qcr,
				   RISCV_IOMMU_QUEUE_ENABLE |
				   RISCV_IOMMU_QUEUE_INTR_ENABLE |
				   RISCV_IOMMU_QUEUE_MEM_FAULT);

		/* NOTE(review): `csr` is undeclared in the visible text. */
		if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
			dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
				queue->qid, csr);

		queue->iommu = NULL;
	}
/*
 * Returns number of available valid queue entries and the first item index.
 * Update shadow producer index if necessary.
 *
 * NOTE(review): truncated in this extraction — only the local-variable
 * setup is visible; the remainder of the function is missing.
 */
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
				     unsigned int *index)
{
	unsigned int head = atomic_read(&queue->head);
	unsigned int tail = atomic_read(&queue->tail);
	unsigned int last = Q_ITEM(queue, tail);
	int available = (int)(tail - head);
/*
 * Return actual consumer index based on hardware reported queue head index.
 *
 * NOTE(review): truncated in this extraction — only the declarations are
 * visible; the remainder of the function is missing.
 */
static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
{
	const unsigned int cons = atomic_read(&queue->head);
	const unsigned int last = Q_ITEM(queue, cons);
	unsigned int head;
/*
 * Enqueue an entry and wait to be processed if timeout_us > 0
 *
 * Error handling for IOMMU hardware not responding in reasonable time
 * will be added as separate patch series along with other RAS features.
 * For now, only report hardware failure and continue.
 *
 * NOTE(review): truncated in this extraction — the `err_busy` label that
 * the three `goto err_busy` statements target, the final return, and the
 * closing brace are not visible here.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
					   void *entry, size_t entry_size)
{
	unsigned int prod;
	unsigned int head;
	unsigned int tail;
	unsigned long flags;

	/* Do not preempt submission flow. */
	local_irq_save(flags);

	/* 1. Allocate some space in the queue */
	prod = atomic_inc_return(&queue->prod) - 1;
	head = atomic_read(&queue->head);

	/* 2. Wait for space availability. */
	if ((prod - head) > queue->mask) {
		if (readx_poll_timeout(atomic_read, &queue->head,
				       head, (prod - head) < queue->mask,
				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
	} else if ((prod - head) == queue->mask) {
		const unsigned int last = Q_ITEM(queue, head);

		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
					      !(head & ~queue->mask) && head != last,
					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
		atomic_add((head - last) & queue->mask, &queue->head);
	}

	/* 3. Store entry in the ring buffer */
	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);

	/* 4. Wait for all previous entries to be ready */
	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
		goto err_busy;

	/*
	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
	 *    completed and visible before signaling the tail doorbell to fetch
	 *    the next command. 'fence ow, ow'
	 */
	dma_wmb();
	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));

	/*
	 * 6. Make sure the doorbell write to the device has finished before updating
	 *    the shadow tail index in normal memory. 'fence o, w'
	 */
	mmiowb();
	atomic_inc(&queue->tail);

	/* 7. Complete submission and restore local interrupts */
	local_irq_restore(flags);
	/*
	 * NOTE(review): fragment — the enclosing function's signature and
	 * the declarations of ddtp, ddt, old, new, ptr, depth, ddi_bits,
	 * base_format and devid are not visible in this extraction. The
	 * final cast suggests this is the device-context lookup
	 * (riscv_iommu_get_dc); confirm against the upstream source.
	 */

	/* Make sure the mode is valid */
	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
		return NULL;

	/* Make sure device id is within range */
	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
	if (devid >= (1 << ddi_bits[depth]))
		return NULL;

	/* Get to the level of the non-leaf node that holds the device context */
	for (ddtp = iommu->ddt_root; depth-- > 0;) {
		const int split = ddi_bits[depth];
		/*
		 * Each non-leaf node is 64bits wide and on each level
		 * nodes are indexed by DDI[depth].
		 */
		ddtp += (devid >> split) & 0x1FF;

		/*
		 * Check if this node has been populated and if not
		 * allocate a new level and populate it.
		 */
		do {
			ddt = READ_ONCE(*(unsigned long *)ddtp);
			if (ddt & RISCV_IOMMU_DDTE_V) {
				ddtp = __va(ppn_to_phys(ddt));
				break;
			}

			ptr = riscv_iommu_get_pages(iommu, SZ_4K);
			if (!ptr)
				return NULL;

			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);

			if (old == ddt) {
				ddtp = (u64 *)ptr;
				break;
			}

			/* Race setting DDT detected, re-read and retry. */
			riscv_iommu_free_pages(iommu, ptr);
		} while (1);
	}

	/*
	 * Grab the node that matches DDI[depth], note that when using base
	 * format the device context is 4 * 64bits, and the extended format
	 * is 8 * 64bits, hence the (3 - base_format) below.
	 */
	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);

	return (struct riscv_iommu_dc *)ddtp;
}
/* * This is best effort IOMMU translation shutdown flow. * Disable IOMMU without waiting for hardware response.
*/ void riscv_iommu_disable(struct riscv_iommu_device *iommu)
{
riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
}
	/*
	 * NOTE(review): fragment of a larger function (device directory
	 * setup); the enclosing signature and the declarations of ddtp
	 * and mode are not visible in this extraction.
	 */
	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/*
	 * It is optional for the hardware to report a fixed address for device
	 * directory root page when DDT.MODE is OFF or BARE.
	 */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
		/* Use WARL to discover hardware fixed DDT PPN */
		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
			return -EBUSY;

		iommu->ddt_phys = ppn_to_phys(ddtp);
		if (iommu->ddt_phys)
			iommu->ddt_root = devm_ioremap(iommu->dev,
						       iommu->ddt_phys, PAGE_SIZE);
		if (iommu->ddt_root)
			memset(iommu->ddt_root, 0, PAGE_SIZE);
	}
	/*
	 * NOTE(review): fragment — this is the tail of a retry loop (the
	 * `continue` and closing `} while (1);` below belong to an
	 * enclosing do/while not visible here); rq_mode, rq_ddtp, mode,
	 * ddtp, ddtp_mode and dev are declared elsewhere.
	 */

	/* Hardware mandatory DDTP mode has not been accepted. */
	if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
		dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
			ddtp, rq_ddtp);
		return -EINVAL;
	}

	/*
	 * Mode field is WARL, an IOMMU may support a subset of
	 * directory table levels in which case if we tried to set
	 * an unsupported number of levels we'll readback either
	 * a valid xLVL or off/bare. If we got off/bare, try again
	 * with a smaller xLVL.
	 */
	if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
	    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
		dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
		rq_mode--;
		continue;
	}

	/*
	 * We tried all supported modes and IOMMU hardware failed to
	 * accept new settings, something went very wrong since off/bare
	 * and at least one xLVL must be supported.
	 */
	dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
		mode, ddtp_mode);
	return -EINVAL;
} while (1);
/* Private IOMMU data for managed devices, dev_iommu_priv_* */
struct riscv_iommu_info {
	struct riscv_iommu_domain *domain;
};
/*
 * Linkage between an iommu_domain and attached devices.
 *
 * Protection domain requiring IOATC and DevATC translation cache invalidations,
 * should be linked to attached devices using a riscv_iommu_bond structure.
 * Devices should be linked to the domain before first use and unlinked after
 * the translations from the referenced protection domain can no longer be used.
 * Blocking and identity domains are not tracked here, as the IOMMU hardware
 * does not cache negative and/or identity (BARE mode) translations, and DevATC
 * is disabled for those protection domains.
 *
 * The device pointer and IOMMU data remain stable in the bond struct after
 * _probe_device() where it's attached to the managed IOMMU, up to the
 * completion of the _release_device() call. The release of the bond structure
 * is synchronized with the device release.
 */
struct riscv_iommu_bond {
	struct list_head list;
	struct rcu_head rcu;
	struct device *dev;
};
	/*
	 * NOTE(review): fragment — enclosing function signature and the
	 * declarations of domain, dev, iommu, bond, found, count and cmd
	 * are not visible in this extraction.
	 */
	spin_lock(&domain->lock);
	list_for_each_entry(bond, &domain->bonds, list) {
		if (found && count)
			break;
		else if (bond->dev == dev)
			found = bond;
		else if (dev_to_iommu(bond->dev) == iommu)
			count++;
	}
	if (found)
		list_del_rcu(&found->list);
	spin_unlock(&domain->lock);
	/* kfree_rcu() is a no-op when found is NULL. */
	kfree_rcu(found, rcu);

	/*
	 * If this was the last bond between this domain and the IOMMU
	 * invalidate all cached entries for domain's PSCID.
	 */
	if (!count) {
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		riscv_iommu_cmd_send(iommu, &cmd);
/*
 * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)
	/*
	 * For each IOMMU linked with this protection domain (via bonds->dev),
	 * an IOTLB invalidation command will be submitted and executed.
	 *
	 * Possible race with domain attach flow is handled by sequencing
	 * bond creation - riscv_iommu_bond_link(), and device directory
	 * update - riscv_iommu_iodir_update().
	 *
	 * PTE Update / IOTLB Inval           Device attach & directory update
	 * --------------------------         --------------------------
	 * update page table entries          add dev to the bond list
	 * FENCE RW,RW                        FENCE RW,RW
	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
	 *   FENCE IOW,IOW                      FENCE IOW,IOW
	 *   IOTLB.INVAL                        IODIR.INVAL
	 *   IOFENCE.C
	 *
	 * If bond list is not updated with new device, directory context will
	 * be configured with already valid page table content. If an IOMMU is
	 * linked to the protection domain it will receive invalidation
	 * requests for updated page table entries.
	 *
	 * NOTE(review): fragment — the enclosing loop over bonds and the
	 * declarations of iommu/prev are not visible in this extraction.
	 */
	smp_mb();

	/*
	 * IOTLB invalidation request can be safely omitted if already sent
	 * to the IOMMU for the same PSCID, and with domain->bonds list
	 * arranged based on the device's IOMMU, it's sufficient to check
	 * last device the invalidation was sent to.
	 */
	if (iommu == prev)
		continue;
/*
 * Update IODIR for the device.
 *
 * During the execution of riscv_iommu_probe_device(), IODIR entries are
 * allocated for the device's identifiers. Device context invalidation
 * becomes necessary only if one of the updated entries was previously
 * marked as valid, given that invalid device context entries are not
 * cached by the IOMMU hardware.
 * In this implementation, updating a valid device context while the
 * device is not quiesced might be disruptive, potentially causing
 * interim translation faults.
 *
 * NOTE(review): this extraction appears truncated — the body of the
 * first loop (which presumably invalidates previously-valid contexts and
 * sets sync_required) is cut short, `cmd` and `sync_required` are never
 * written in the visible text, and the function has no visible end.
 */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
				     struct device *dev, u64 fsc, u64 ta)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_dc *dc;
	struct riscv_iommu_command cmd;
	bool sync_required = false;
	u64 tc;
	int i;

	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		if (!(tc & RISCV_IOMMU_DC_TC_V))
			continue;

	if (sync_required)
		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	/*
	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
	 */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		tc |= ta & RISCV_IOMMU_DC_TC_V;

		WRITE_ONCE(dc->fsc, fsc);
		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
		/* Update device context, write TC.V as the last step. */
		dma_wmb();
		WRITE_ONCE(dc->tc, tc);
	/*
	 * NOTE(review): fragment — the enclosing function's signature and
	 * the declarations of ptr, pte, i, domain and freelist are not
	 * visible in this extraction.
	 */

	/* Recursively free all sub page table pages */
	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = READ_ONCE(ptr[i]);
		/* Claim each live entry with cmpxchg before descending. */
		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
			riscv_iommu_pte_free(domain, pte, freelist);
	}

	if (freelist)
		iommu_pages_list_add(freelist, ptr);
	else
		iommu_free_pages(ptr);
}
	/*
	 * NOTE(review): fragment — the enclosing function's signature and
	 * the declarations of level, iova, pgsize, ptr, pte, old, addr,
	 * domain and gfp are not visible in this extraction. The walk
	 * descends the page table, allocating missing non-leaf levels.
	 */
	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		/*
		 * Note: returned entry might be a non-leaf if there was
		 * existing mapping with smaller granularity. Up to the caller
		 * to replace and invalidate.
		 */
		if (((size_t)1 << shift) == pgsize)
			return ptr;
pte_retry:
		pte = READ_ONCE(*ptr);
		/*
		 * This is very likely incorrect as we should not be adding
		 * new mapping with smaller granularity on top
		 * of existing 2M/1G mapping. Fail.
		 */
		if (_io_pte_present(pte) && _io_pte_leaf(pte))
			return NULL;
		/*
		 * Non-leaf entry is missing, allocate and try to add to the
		 * page table. This might race with other mappings, retry.
		 */
		if (_io_pte_none(pte)) {
			addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp,
							 SZ_4K);
			if (!addr)
				return NULL;

			old = pte;
			pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
			/* Lost the race installing the new level — retry. */
			if (cmpxchg_relaxed(ptr, old, pte) != old) {
				iommu_free_pages(addr);
				goto pte_retry;
			}
		}
		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
	} while (level-- > 0);
	/*
	 * NOTE(review): fragment — freelist, domain, va_mask and va_bits
	 * are declared elsewhere; this extraction splices pieces of (at
	 * least) two different functions together.
	 */
	if (!iommu_pages_list_empty(&freelist)) {
		/*
		 * In 1.0 spec version, the smallest scope we can use to
		 * invalidate all levels of page table (i.e. leaf and non-leaf)
		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
		 * This will be updated with hardware support for
		 * capability.NL (non-leaf) IOTINVAL command.
		 */
		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
		iommu_put_pages_list(&freelist);
	}

	/*
	 * Note: RISC-V Privilege spec mandates that virtual addresses
	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
	 * bits >= VA_BITS need to also be set or else we'll get a
	 * page fault. However the code that creates the mappings
	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
	 * for now, so we'll end up with invalid virtual addresses
	 * to map. As a workaround until we get this sorted out
	 * limit the available virtual addresses to VA_BITS - 1.
	 */
	va_mask = DMA_BIT_MASK(va_bits - 1);
	/*
	 * NOTE(review): fragment of the device probe path — the enclosing
	 * signature and the declarations of fwspec, iommu, info, dc, tc,
	 * i and dev are not visible in this extraction.
	 */
	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
		return ERR_PTR(-ENODEV);

	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	/*
	 * IOMMU hardware operating in fail-over BARE mode will provide
	 * identity translation for all connected devices anyway...
	 */
	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	/*
	 * Allocate and pre-configure device context entries in
	 * the device directory. Do not mark the context valid yet.
	 */
	tc = 0;
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
		tc |= RISCV_IOMMU_DC_TC_SADE;
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		if (!dc) {
			kfree(info);
			return ERR_PTR(-ENODEV);
		}
		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
			dev_warn(dev, "already attached to IOMMU device directory\n");
		WRITE_ONCE(dc->tc, tc);
	}
	/*
	 * NOTE(review): fragment of hardware sanity/init checks — the
	 * enclosing signature and the declaration of ddtp are not visible
	 * in this extraction, and the body is cut off after the
	 * irqs_count check.
	 */

	/*
	 * Make sure the IOMMU is switched off or in pass-through mode during
	 * regular boot flow and disable translation when we boot into a kexec
	 * kernel and the previous kernel left them enabled.
	 */
	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
	    RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
		if (!is_kdump_kernel())
			return -EBUSY;
		riscv_iommu_disable(iommu);
	}

	/* Configure accesses to in-memory data structures for CPU-native byte order. */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
			return -EINVAL;
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
		/* Re-read to verify the WARL toggle actually took effect. */
		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
			return -EINVAL;
	}

	/*
	 * Distribute interrupt vectors, always use first vector for CIV.
	 * At least one interrupt is required. Read back and verify.
	 */
	if (!iommu->irqs_count)
		return -EINVAL;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.