/* Copyright (c) 2019-2021, The Linux Foundation. All rights reserved. */ /* Copyright (c) 2021-2023 Qualcomm Innovation Center, Inc. All rights reserved. */
/*
 * Sleep interval between queue checks when datapath polling is enabled.
 * Runtime-writable (0600) via module parameters.
 */
static unsigned int datapath_poll_interval_us = 100; /* 100 usec default */
module_param(datapath_poll_interval_us, uint, 0600);
MODULE_PARM_DESC(datapath_poll_interval_us, "Amount of time to sleep between activity when datapath polling is enabled");
/* A single element of the DMA bridge channel (DBC) request queue, as consumed by hardware */
struct dbc_req {
	/*
	 * A request ID is assigned to each memory handle going in DMA queue.
	 * As a single memory handle can enqueue multiple elements in DMA queue
	 * all of them will have the same request ID.
	 */
	__le16 req_id;
	/* Future use */
	__u8 seq_id;
	/*
	 * Special encoded variable
	 * 7	0 - Do not force to generate MSI after DMA is completed
	 *	1 - Force to generate MSI after DMA is completed
	 * 6:5	Reserved
	 * 4	1 - Generate completion element in the response queue
	 *	0 - No Completion Code
	 * 3	0 - DMA request is a Link list transfer
	 *	1 - DMA request is a Bulk transfer
	 * 2	Reserved
	 * 1:0	00 - No DMA transfer involved
	 *	01 - DMA transfer is part of inbound transfer
	 *	10 - DMA transfer has outbound transfer
	 *	11 - NA
	 */
	__u8 cmd;
	__le32 resv;
	/* Source address for the transfer */
	__le64 src_addr;
	/* Destination address for the transfer */
	__le64 dest_addr;
	/* Length of transfer request */
	__le32 len;
	__le32 resv2;
	/* Doorbell address */
	__le64 db_addr;
	/*
	 * Special encoded variable
	 * 7	1 - Doorbell(db) write
	 *	0 - No doorbell write
	 * 6:2	Reserved
	 * 1:0	00 - 32 bit access, db address must be aligned to 32bit-boundary
	 *	01 - 16 bit access, db address must be aligned to 16bit-boundary
	 *	10 - 8 bit access, db address must be aligned to 8bit-boundary
	 *	11 - Reserved
	 */
	__u8 db_len;
	__u8 resv3;
	__le16 resv4;
	/* 32 bit data written to doorbell address */
	__le32 db_data;
	/*
	 * Special encoded variable
	 * All the fields of sem_cmdX are passed from user and all are ORed
	 * together to form sem_cmd.
	 * 0:11	Semaphore value
	 * 15:12	Reserved
	 * 20:16	Semaphore index
	 * 21	Reserved
	 * 22	Semaphore Sync
	 * 23	Reserved
	 * 26:24	Semaphore command
	 * 28:27	Reserved
	 * 29	Semaphore DMA out bound sync fence
	 * 30	Semaphore DMA in bound sync fence
	 * 31	Enable semaphore command
	 */
	__le32 sem_cmd0;
	__le32 sem_cmd1;
	__le32 sem_cmd2;
	__le32 sem_cmd3;
} __packed;
/* A single element of the DBC response queue, produced by hardware per completed request */
struct dbc_rsp {
	/* Request ID of the memory handle whose DMA transaction is completed */
	__le16 req_id;
	/* Status of the DMA transaction. 0 : Success otherwise failure */
	__le16 status;
} __packed;
/*
 * Encode the user-requested doorbell width into the hardware db_len format:
 * BIT(7) marks the doorbell active, bits 1:0 select the access width
 * (00 = 32 bit, 01 = 16 bit, 10 = 8 bit) per the dbc_req layout above.
 */
switch (req->db_len) {
case 32:
	db_len = BIT(7);
	break;
case 16:
	db_len = BIT(7) | 1;
	break;
case 8:
	db_len = BIT(7) | 2;
	break;
case 0:
	db_len = 0; /* doorbell is not active for this command */
	break;
default:
	return -EINVAL; /* should never hit this */
}
/*
 * When we end up splitting up a single request (ie a buf slice) into
 * multiple DMA requests, we have to manage the sync data carefully.
 * There can only be one presync sem. That needs to be on every xfer
 * so that the DMA engine doesn't transfer data before the receiver is
 * ready. We only do the doorbell and postsync sems after the xfer.
 * To guarantee previous xfers for the request are complete, we use a
 * fence.
 */
dev_addr = req->dev_addr;
/* Build one dbc_req per DMA segment of this slice's SG table */
for_each_sgtable_dma_sg(slice->sgt, sg, i) {
	slice->reqs[i].cmd = cmd;
	/* Direction decides which side is the SG segment and which is the device address */
	slice->reqs[i].src_addr = cpu_to_le64(slice->dir == DMA_TO_DEVICE ?
					      sg_dma_address(sg) : dev_addr);
	slice->reqs[i].dest_addr = cpu_to_le64(slice->dir == DMA_TO_DEVICE ?
					       dev_addr : sg_dma_address(sg));
	/*
	 * sg_dma_len(sg) returns size of a DMA segment, maximum DMA
	 * segment size is set to UINT_MAX by qaic and hence return
	 * values of sg_dma_len(sg) can never exceed u32 range. So,
	 * by down sizing we are not corrupting the value.
	 */
	slice->reqs[i].len = cpu_to_le32((u32)sg_dma_len(sg));
	/* Replicate the single presync semaphore onto every segment of the slice */
	switch (presync_sem) {
	case BIT(0):
		slice->reqs[i].sem_cmd0 = cpu_to_le32(ENCODE_SEM(req->sem0.val,
								 req->sem0.index,
								 req->sem0.presync,
								 req->sem0.cmd,
								 req->sem0.flags));
		break;
	case BIT(1):
		slice->reqs[i].sem_cmd1 = cpu_to_le32(ENCODE_SEM(req->sem1.val,
								 req->sem1.index,
								 req->sem1.presync,
								 req->sem1.cmd,
								 req->sem1.flags));
		break;
	case BIT(2):
		slice->reqs[i].sem_cmd2 = cpu_to_le32(ENCODE_SEM(req->sem2.val,
								 req->sem2.index,
								 req->sem2.presync,
								 req->sem2.cmd,
								 req->sem2.flags));
		break;
	case BIT(3):
		slice->reqs[i].sem_cmd3 = cpu_to_le32(ENCODE_SEM(req->sem3.val,
								 req->sem3.index,
								 req->sem3.presync,
								 req->sem3.cmd,
								 req->sem3.flags));
		break;
	}
	dev_addr += sg_dma_len(sg);
}
/* add post transfer stuff to last segment */
i--;
slice->reqs[i].cmd |= GEN_COMPLETION;
slice->reqs[i].db_addr = db_addr;
slice->reqs[i].db_len = db_len;
slice->reqs[i].db_data = db_data;
/*
 * Add a fence if we have more than one request going to the hardware
 * representing the entirety of the user request, and the user request
 * has no presync condition.
 * Fences are expensive, so we try to avoid them. We rely on the
 * hardware behavior to avoid needing one when there is a presync
 * condition. When a presync exists, all requests for that same
 * presync will be queued into a fifo. Thus, since we queue the
 * post xfer activity only on the last request we queue, the hardware
 * will ensure that the last queued request is processed last, thus
 * making sure the post xfer activity happens at the right time without
 * a fence.
 */
if (i && !presync_sem)
	req->sem0.flags |= (slice->dir == DMA_TO_DEVICE ?
			    QAIC_SEM_INSYNCFENCE : QAIC_SEM_OUTSYNCFENCE);
/* All four postsync semaphores are applied only to the final segment */
slice->reqs[i].sem_cmd0 = cpu_to_le32(ENCODE_SEM(req->sem0.val, req->sem0.index,
						 req->sem0.presync, req->sem0.cmd,
						 req->sem0.flags));
slice->reqs[i].sem_cmd1 = cpu_to_le32(ENCODE_SEM(req->sem1.val, req->sem1.index,
						 req->sem1.presync, req->sem1.cmd,
						 req->sem1.flags));
slice->reqs[i].sem_cmd2 = cpu_to_le32(ENCODE_SEM(req->sem2.val, req->sem2.index,
						 req->sem2.presync, req->sem2.cmd,
						 req->sem2.flags));
slice->reqs[i].sem_cmd3 = cpu_to_le32(ENCODE_SEM(req->sem3.val, req->sem3.index,
						 req->sem3.presync, req->sem3.cmd,
						 req->sem3.flags));
/*
 * create_sgt() - Allocate physical pages backing a buffer of @size bytes and
 * describe them with a newly allocated SG table.
 * @qdev: Qaic device handle
 * @sgt_out: On success, receives the newly allocated SG table
 * @size: Requested buffer size in bytes; 0 allocates a single bookkeeping page
 *
 * Pages are allocated highest-order first, falling back to smaller orders on
 * failure; over-allocation from high-order pages is trimmed off the final SG
 * entry via buf_extra.
 *
 * NOTE(review): this chunk is truncated — the allocation of the pages[] and
 * pages_order[] arrays and the function's tail (success return and the
 * free_sgt/free_partial_alloc error labels) are not visible here; confirm
 * against the full file.
 */
static int create_sgt(struct qaic_device *qdev, struct sg_table **sgt_out, u64 size)
{
	struct scatterlist *sg;
	struct sg_table *sgt;
	struct page **pages;
	int *pages_order;
	int buf_extra;
	int max_order;
	int nr_pages;
	int ret = 0;
	int i, j, k;
	int order;

	if (size) {
		nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
		/*
		 * calculate how much extra we are going to allocate, to remove
		 * later
		 */
		buf_extra = (PAGE_SIZE - size % PAGE_SIZE) % PAGE_SIZE;
		max_order = min(MAX_PAGE_ORDER, get_order(size));
	} else {
		/* allocate a single page for book keeping */
		nr_pages = 1;
		buf_extra = 0;
		max_order = 0;
	}

	/*
	 * Allocate requested memory using alloc_pages. It is possible to allocate
	 * the requested memory in multiple chunks by calling alloc_pages
	 * multiple times. Use SG table to handle multiple allocated pages.
	 */
	i = 0;
	while (nr_pages > 0) {
		order = min(get_order(nr_pages * PAGE_SIZE), max_order);
		/* Retry at ever-smaller orders until an allocation succeeds */
		while (1) {
			pages[i] = alloc_pages(GFP_KERNEL | GFP_HIGHUSER |
					       __GFP_NOWARN | __GFP_ZERO |
					       (order ? __GFP_NORETRY : __GFP_RETRY_MAYFAIL),
					       order);
			if (pages[i])
				break;
			if (!order--) {
				ret = -ENOMEM;
				goto free_partial_alloc;
			}
		}

		max_order = order;
		pages_order[i] = order;
		nr_pages -= 1 << order;
		if (nr_pages <= 0)
			/* account for over allocation */
			buf_extra += abs(nr_pages) * PAGE_SIZE;
		i++;
	}

	sgt = kmalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt) {
		ret = -ENOMEM;
		goto free_partial_alloc;
	}

	if (sg_alloc_table(sgt, i, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto free_sgt;
	}

	/* Populate the SG table with the allocated memory pages */
	sg = sgt->sgl;
	for (k = 0; k < i; k++, sg = sg_next(sg)) {
		/* Last entry requires special handling */
		if (k < i - 1) {
			sg_set_page(sg, pages[k], PAGE_SIZE << pages_order[k], 0);
		} else {
			/* Trim the over-allocated bytes off the tail entry */
			sg_set_page(sg, pages[k], (PAGE_SIZE << pages_order[k]) - buf_extra, 0);
			sg_mark_end(sg);
		}
	}
/* Allocate the driver's BO wrapper for the imported dma-buf */
bo = qaic_alloc_init_bo();
if (IS_ERR(bo)) {
	ret = PTR_ERR(bo);
	goto out;
}
obj = &bo->base;

/* Hold a dma-buf reference for the lifetime of the attachment */
get_dma_buf(dma_buf);

attach = dma_buf_attach(dma_buf, dev->dev);
if (IS_ERR(attach)) {
	ret = PTR_ERR(attach);
	goto attach_fail;
}

/* A zero-sized dma-buf cannot back a usable BO */
if (!attach->dmabuf->size) {
	ret = -EINVAL;
	goto size_align_fail;
}

drm_gem_private_object_init(dev, obj, attach->dmabuf->size);
/*
 * skipping dma_buf_map_attachment() as we do not know the direction
 * just yet. Once the direction is known in the subsequent IOCTL to
 * attach slicing, we can do it then.
 */
/*
 * After this for loop is complete, first_n represents the index
 * of the last DMA request of this slice that needs to be
 * transferred after resizing and last_bytes represents DMA size
 * of that request.
 */
last_bytes = resize;
for (first_n = 0; first_n < slice->nents; first_n++)
	if (last_bytes > le32_to_cpu(reqs[first_n].len))
		last_bytes -= le32_to_cpu(reqs[first_n].len);
	else
		break;

/*
 * Copy over the last entry. Here we need to adjust len to the left over
 * size, and set src and dst to the entry it is copied to.
 */
last_req = fifo_at(dbc->req_q_base, (tail + first_n) % dbc->nelem);
memcpy(last_req, reqs + slice->nents - 1, sizeof(*reqs));

/*
 * last_bytes holds size of a DMA segment, maximum DMA segment size is
 * set to UINT_MAX by qaic and hence last_bytes can never exceed u32
 * range. So, by down sizing we are not corrupting the value.
 */
last_req->len = cpu_to_le32((u32)last_bytes);
/* Point the truncated request at the addresses of the segment it replaces */
last_req->src_addr = reqs[first_n].src_addr;
last_req->dest_addr = reqs[first_n].dest_addr;
if (!last_bytes)
	/* Disable DMA transfer */
	last_req->cmd = GENMASK(7, 2) & reqs[first_n].cmd;
/* Validate every BO handle in the user's execute request before queuing */
for (i = 0; i < count; i++) {
	/*
	 * ref count will be decremented when the transfer of this
	 * buffer is complete. It is inside dbc_irq_threaded_fn().
	 */
	obj = drm_gem_object_lookup(file_priv,
				    is_partial ? pexec[i].handle : exec[i].handle);
	if (!obj) {
		ret = -ENOENT;
		goto failed_to_send_bo;
	}

	bo = to_qaic_bo(obj);
	ret = mutex_lock_interruptible(&bo->lock);
	if (ret)
		goto failed_to_send_bo;

	/* Only BOs with attached slicing can be executed */
	if (!bo->sliced) {
		ret = -EINVAL;
		goto unlock_bo;
	}

	/* A partial execute may not resize beyond the BO's full size */
	if (is_partial && pexec[i].resize > bo->base.size) {
		ret = -EINVAL;
		goto unlock_bo;
	}

	/* Reject a BO that is already committed to the hardware queue */
	spin_lock_irqsave(&dbc->xfer_lock, flags);
	if (bo_queued(bo)) {
		spin_unlock_irqrestore(&dbc->xfer_lock, flags);
		ret = -EINVAL;
		goto unlock_bo;
	}
/* Record per-BO profiling data after the BOs were committed to hardware */
for (i = 0; i < count; i++) {
	/*
	 * Since we already committed the BO to hardware, the only way
	 * this should fail is a pending signal. We can't cancel the
	 * submit to hardware, so we have to just skip the profiling
	 * data. In case the signal is not fatal to the process, we
	 * return success so that the user doesn't try to resubmit.
	 */
	obj = drm_gem_object_lookup(file_priv,
				    is_partial ? pexec[i].handle : exec[i].handle);
	if (!obj)
		break;
	bo = to_qaic_bo(obj);
	/* Timestamps and queue depth reported back via the perf stats ioctl */
	bo->perf_stats.req_received_ts = received_ts;
	bo->perf_stats.req_submit_ts = submit_ts;
	bo->perf_stats.queue_level_before = queue_level;
	queue_level += bo->total_slice_nents;
	drm_gem_object_put(obj);
}
}
/* * Our interrupt handling is a bit more complicated than a simple ideal, but * sadly necessary. * * Each dbc has a completion queue. Entries in the queue correspond to DMA * requests which the device has processed. The hardware already has a built * in irq mitigation. When the device puts an entry into the queue, it will * only trigger an interrupt if the queue was empty. Therefore, when adding * the Nth event to a non-empty queue, the hardware doesn't trigger an * interrupt. This means the host doesn't get additional interrupts signaling * the same thing - the queue has something to process. * This behavior can be overridden in the DMA request. * This means that when the host receives an interrupt, it is required to * drain the queue. * * This behavior is what NAPI attempts to accomplish, although we can't use * NAPI as we don't have a netdev. We use threaded irqs instead. * * However, there is a situation where the host drains the queue fast enough * that every event causes an interrupt. Typically this is not a problem as * the rate of events would be low. However, that is not the case with * lprnet for example. On an Intel Xeon D-2191 where we run 8 instances of * lprnet, the host receives roughly 80k interrupts per second from the device * (per /proc/interrupts). While NAPI documentation indicates the host should * just chug along, sadly that behavior causes instability in some hosts. * * Therefore, we implement an interrupt disable scheme similar to NAPI. The * key difference is that we will delay after draining the queue for a small * time to allow additional events to come in via polling. Using the above * lprnet workload, this reduces the number of interrupts processed from * ~80k/sec to about 64 in 5 minutes and appears to solve the system * instability.
*/
/*
 * Hard IRQ handler for a DBC. Validates the queue pointers are readable and
 * that the channel is in use; see the scheme described in the comment above.
 * NOTE(review): this chunk is truncated — the handler's tail (what happens
 * after both pointers read back valid) is not visible here.
 */
irqreturn_t dbc_irq_handler(int irq, void *data)
{
	struct dma_bridge_chan *dbc = data;
	int rcu_id;
	u32 head;
	u32 tail;

	rcu_id = srcu_read_lock(&dbc->ch_lock);

	if (datapath_polling) {
		srcu_read_unlock(&dbc->ch_lock, rcu_id);
		/*
		 * Normally datapath_polling will not have irqs enabled, but
		 * when running with only one MSI the interrupt is shared with
		 * MHI so it cannot be disabled. Return ASAP instead.
		 */
		return IRQ_HANDLED;
	}

	/* Channel not assigned to a user - nothing to process */
	if (!dbc->usr) {
		srcu_read_unlock(&dbc->ch_lock, rcu_id);
		return IRQ_HANDLED;
	}

	head = readl(dbc->dbc_base + RSPHP_OFF);
	if (head == U32_MAX) { /* PCI link error */
		srcu_read_unlock(&dbc->ch_lock, rcu_id);
		return IRQ_NONE;
	}

	tail = readl(dbc->dbc_base + RSPTP_OFF);
	if (tail == U32_MAX) { /* PCI link error */
		srcu_read_unlock(&dbc->ch_lock, rcu_id);
		return IRQ_NONE;
	}
head = readl(dbc->dbc_base + RSPHP_OFF); if (head == U32_MAX) /* PCI link error */ goto error_out;
read_fifo:
if (!event_count) {
event_count = NUM_EVENTS;
cond_resched();
}
/* * if this channel isn't assigned or gets unassigned during processing * we have nothing further to do
*/ if (!dbc->usr) goto error_out;
tail = readl(dbc->dbc_base + RSPTP_OFF); if (tail == U32_MAX) /* PCI link error */ goto error_out;
if (head == tail) { /* queue empty */ if (delay_count) {
--delay_count;
usleep_range(100, 200); goto read_fifo; /* check for a new event */
} goto normal_out;
}
delay_count = NUM_DELAYS; while (head != tail) { if (!event_count) break;
--event_count;
rsp = dbc->rsp_q_base + head * sizeof(*rsp);
req_id = le16_to_cpu(rsp->req_id);
status = le16_to_cpu(rsp->status); if (status)
pci_dbg(qdev->pdev, "req_id %d failed with status %d\n", req_id, status);
spin_lock_irqsave(&dbc->xfer_lock, flags); /* * A BO can receive multiple interrupts, since a BO can be * divided into multiple slices and a buffer receives as many * interrupts as slices. So until it receives interrupts for * all the slices we cannot mark that buffer complete.
*/
list_for_each_entry_safe(bo, i, &dbc->xfer_list, xfer_list) { if (bo->req_id == req_id)
bo->nr_slice_xfer_done++; else continue;
if (bo->nr_slice_xfer_done < bo->nr_slice) break;
/* * At this point we have received all the interrupts for * BO, which means BO execution is complete.
*/
dma_sync_sgtable_for_cpu(&qdev->pdev->dev, bo->sgt, bo->dir);
bo->nr_slice_xfer_done = 0;
list_del_init(&bo->xfer_list);
bo->perf_stats.req_processed_ts = ktime_get_ns();
complete_all(&bo->xfer_done);
drm_gem_object_put(&bo->base); break;
}
spin_unlock_irqrestore(&dbc->xfer_lock, flags);
head = (head + 1) % dbc->nelem;
}
/* * Update the head pointer of response queue and let the device know * that we have consumed elements from the queue.
*/
writel(head, dbc->dbc_base + RSPHP_OFF);
/* elements might have been put in the queue while we were processing */ goto read_fifo;
normal_out: if (!qdev->single_msi && likely(!datapath_polling))
enable_irq(irq); elseif (unlikely(datapath_polling))
schedule_work(&dbc->poll_work); /* checking the fifo and enabling irqs is a race, missed event check */
tail = readl(dbc->dbc_base + RSPTP_OFF); if (tail != U32_MAX && head != tail) { if (!qdev->single_msi && likely(!datapath_polling))
disable_irq_nosync(irq); goto read_fifo;
}
srcu_read_unlock(&dbc->ch_lock, rcu_id); return IRQ_HANDLED;
/* Validate user context, device state, and BO handle before waiting */
usr = file_priv->driver_priv;
usr_rcu_id = srcu_read_lock(&usr->qddev_lock);
if (!usr->qddev) {
	ret = -ENODEV;
	goto unlock_usr_srcu;
}

qdev = usr->qddev->qdev;
qdev_rcu_id = srcu_read_lock(&qdev->dev_lock);
if (qdev->dev_state != QAIC_ONLINE) {
	ret = -ENODEV;
	goto unlock_dev_srcu;
}

obj = drm_gem_object_lookup(file_priv, args->handle);
if (!obj) {
	ret = -ENOENT;
	goto unlock_dev_srcu;
}
bo = to_qaic_bo(obj);

ret = mutex_lock_interruptible(&bo->lock);
if (ret)
	goto put_bo;

/* Only BOs with attached slicing participate in DMA */
if (!bo->sliced) {
	ret = -EINVAL;
	goto unlock_bo;
}

/* The BO's channel must belong to the calling user */
dbc = bo->dbc;
rcu_id = srcu_read_lock(&dbc->ch_lock);
if (dbc->usr != usr) {
	ret = -EINVAL;
	goto unlock_ch_srcu;
}

/* Check if BO is committed to H/W for DMA */
spin_lock_irqsave(&dbc->xfer_lock, flags);
if (bo_queued(bo)) {
	spin_unlock_irqrestore(&dbc->xfer_lock, flags);
	ret = -EBUSY;
	goto unlock_ch_srcu;
}
spin_unlock_irqrestore(&dbc->xfer_lock, flags);
/**
 * enable_dbc - Enable the DBC. DBCs are disabled by removing the context of
 * user. Add user context back to DBC to enable it. This function trusts the
 * DBC ID passed and expects the DBC to be disabled.
 * @qdev: Qranium device handle
 * @dbc_id: ID of the DBC
 * @usr: User context
 */
void enable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr)
{
	struct dma_bridge_chan *dbc = &qdev->dbc[dbc_id];

	/* Attaching a user context is what marks the channel enabled */
	dbc->usr = usr;
}
/* Detach the user so the irq/poll paths stop processing this channel */
dbc->usr = NULL;
empty_xfer_list(qdev, dbc);
/* Wait for in-flight readers of ch_lock to finish before the final flush */
synchronize_srcu(&dbc->ch_lock);
/*
 * Threads holding channel lock, may add more elements in the xfer_list.
 * Flush out these elements from xfer_list.
 */
empty_xfer_list(qdev, dbc);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.