/* Given the device's max supported MTU and pages of at least 4KB a packet can * be scattered into at most 4 buffers.
*/ #define RX_MAX_FRAGS 4
/* Per packet headroom in non-XDP mode. Present only for 1-frag packets. */ #define FUN_RX_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)
/* We try to reuse pages for our buffers. To avoid frequent page ref writes we * take EXTRA_PAGE_REFS references at once and then hand them out one per packet * occupying the buffer.
*/ #define EXTRA_PAGE_REFS 1000000 #define MIN_PAGE_REFS 1000
/* Top up the stash of page references we hold on @buf's page if it has run
 * low, taking a large batch at once to amortize the atomic ref update.
 */
static void refresh_refs(struct funeth_rxbuf *buf)
{
	if (likely(buf->pg_refs >= MIN_PAGE_REFS))
		return;

	buf->pg_refs += EXTRA_PAGE_REFS;
	page_ref_add(buf->page, EXTRA_PAGE_REFS);
}
/* Offer a buffer to the Rx buffer cache. The cache will hold the buffer if its * page is worth retaining and there's room for it. Otherwise the page is * unmapped and our references released.
*/ staticvoid cache_offer(struct funeth_rxq *q, conststruct funeth_rxbuf *buf)
{ struct funeth_rx_cache *c = &q->cache;
	/* NOTE(review): truncated in this extract -- the body that either
	 * stores @buf in the cache ring or unmaps its page and drains our
	 * references when the cache is full is missing.  Recover the full
	 * function before building.
	 */
/* Get a page from the Rx buffer cache. We only consider the next available * page and return it if we own all its references.
*/ staticbool cache_get(struct funeth_rxq *q, struct funeth_rxbuf *rb)
{ struct funeth_rx_cache *c = &q->cache; struct funeth_rxbuf *buf;
if (c->prod_cnt == c->cons_cnt) returnfalse; /* empty cache */
/* NOTE(review): this extract never assigns @buf (presumably the cache entry
 * at the consumer index) and the path that hands a reusable page back through
 * @rb and returns true is missing -- as written the code below dereferences
 * an uninitialized pointer.  Recover the full body before building.
 */
/* Page can't be reused. If the cache is full drop this page. */ if (c->prod_cnt - c->cons_cnt > c->mask) {
dma_unmap_page_attrs(q->dma_dev, buf->dma_addr, PAGE_SIZE,
DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
__page_frag_cache_drain(buf->page, buf->pg_refs);
buf->page = NULL;
c->cons_cnt++;
} returnfalse;
}
/* Allocate and DMA-map a page for receive. */ staticint funeth_alloc_page(struct funeth_rxq *q, struct funeth_rxbuf *rb, int node, gfp_t gfp)
{ struct page *p;
/* try to satisfy the request from the Rx page cache first */
if (cache_get(q, rb)) return 0;
p = __alloc_pages_node(node, gfp | __GFP_NOWARN, 0); if (unlikely(!p)) return -ENOMEM;
/* NOTE(review): truncated in this extract -- the DMA mapping of @p and the
 * initialization of @rb (page, dma_addr, node, pg_refs) are missing, as is
 * the function's tail.  Recover the full body before building.
 */
/* Run the XDP program assigned to an Rx queue. * Return %NULL if the buffer is consumed, or the virtual address of the packet * to turn into an skb.
*/ staticvoid *fun_run_xdp(struct funeth_rxq *q, skb_frag_t *frags, void *buf_va, int ref_ok, struct funeth_txq *xdp_q)
{ struct bpf_prog *xdp_prog; struct xdp_frame *xdpf; struct xdp_buff xdp;
u32 act;
/* VA includes the headroom, frag size includes headroom + tailroom */
xdp_init_buff(&xdp, ALIGN(skb_frag_size(frags), FUN_EPRQ_PKT_ALIGN),
&q->xdp_rxq);
xdp_prepare_buff(&xdp, buf_va, FUN_XDP_HEADROOM, skb_frag_size(frags) -
(FUN_RX_TAILROOM + FUN_XDP_HEADROOM), false);
/* NOTE(review): @act is switched on below but never assigned in this
 * extract -- the fetch of the queue's XDP program and the bpf_prog_run()
 * call appear to be missing, as do the XDP_TX/XDP_REDIRECT/abort arms, the
 * "pass" label, and the function's tail.  Recover the full body before
 * building.
 */
switch (act) { case XDP_PASS: /* remove headroom, which may not be FUN_XDP_HEADROOM now */
skb_frag_size_set(frags, xdp.data_end - xdp.data);
skb_frag_off_add(frags, xdp.data - xdp.data_hard_start); goto pass; case XDP_TX: if (unlikely(!ref_ok)) goto pass;
/* A CQE contains a fixed completion structure along with optional metadata and * even packet data. Given the start address of a CQE return the start of the * contained fixed structure, which lies at the end.
*/ staticconstvoid *cqe_to_info(constvoid *cqe)
{ return cqe + FUNETH_CQE_INFO_OFFSET;
}
/* The inverse of cqe_to_info(). */ staticconstvoid *info_to_cqe(constvoid *cqe_info)
{ return cqe_info - FUNETH_CQE_INFO_OFFSET;
}
/* Return the type of hash provided by the device based on the L3 and L4 * protocols it parsed for the packet.
*/ staticenum pkt_hash_types cqe_to_pkt_hash_type(u16 pkt_parse)
{ staticconstenum pkt_hash_types htype_map[] = {
PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3,
PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L4,
PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3,
PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3
};
u16 key;
/* Build the key from the TCP/UDP and IP/IPv6 bits */
key = ((pkt_parse >> FUN_ETH_RX_CV_OL4_PROT_S) & 6) |
((pkt_parse >> (FUN_ETH_RX_CV_OL3_PROT_S + 1)) & 1);
return htype_map[key];
}
/* Each received packet can be scattered across several Rx buffers or can * share a buffer with previously received packets depending on the buffer * and packet sizes and the room available in the most recently used buffer. * * The rules are: * - If the buffer at the head of an RQ has not been used it gets (part of) the * next incoming packet. * - Otherwise, if the packet fully fits in the buffer's remaining space the * packet is written there. * - Otherwise, the packet goes into the next Rx buffer. * * This function returns the Rx buffer for a packet or fragment thereof of the * given length. If it isn't @buf it either recycles or frees that buffer * before advancing the queue to the next buffer. * * If called repeatedly with the remaining length of a packet it will walk * through all the buffers containing the packet.
*/ staticstruct funeth_rxbuf *
get_buf(struct funeth_rxq *q, struct funeth_rxbuf *buf, unsignedint len)
{ if (q->buf_offset + len <= PAGE_SIZE || !q->buf_offset) return buf; /* @buf holds (part of) the packet */
/* The packet occupies part of the next buffer. Move there after * replenishing the current buffer slot either with the spare page or * by reusing the slot's existing page. Note that if a spare page isn't * available and the current packet occupies @buf it is a multi-frag * packet that will be dropped leaving @buf available for reuse.
*/ if ((page_ref_count(buf->page) == buf->pg_refs &&
buf->node == numa_mem_id()) || !q->spare_buf.page) {
dma_sync_single_for_device(q->dma_dev, buf->dma_addr,
PAGE_SIZE, DMA_FROM_DEVICE);
refresh_refs(buf);
} else {
cache_offer(q, buf);
*buf = q->spare_buf;
q->spare_buf.page = NULL;
q->rqes[q->rq_cons & q->rq_mask] =
FUN_EPRQ_RQBUF_INIT(buf->dma_addr);
}
q->buf_offset = 0;
q->rq_cons++; return &q->bufs[q->rq_cons & q->rq_mask];
}
/* Gather the page fragments making up the first Rx packet on @q. Its total * length @tot_len includes optional head- and tail-rooms. * * Return 0 if the device retains ownership of at least some of the pages. * In this case the caller may only copy the packet. * * A non-zero return value gives the caller permission to use references to the * pages, e.g., attach them to skbs. Additionally, if the value is <0 at least * one of the pages is PF_MEMALLOC. * * Regardless of outcome the caller is granted a reference to each of the pages.
*/ staticint fun_gather_pkt(struct funeth_rxq *q, unsignedint tot_len,
skb_frag_t *frags)
{ struct funeth_rxbuf *buf = q->cur_buf; unsignedint frag_len; int ref_ok = 1;
for (;;) {
buf = get_buf(q, buf, tot_len);
/* We always keep the RQ full of buffers so before we can give * one of our pages to the stack we require that we can obtain * a replacement page. If we can't the packet will either be * copied or dropped so we can retain ownership of the page and * reuse it.
*/ if (!q->spare_buf.page &&
funeth_alloc_page(q, &q->spare_buf, numa_mem_id(),
GFP_ATOMIC | __GFP_MEMALLOC))
ref_ok = 0;
/* Advance the CQ pointers and phase tag to the next CQE. */
static void advance_cq(struct funeth_rxq *q)
{
	if (likely(q->cq_head < q->cq_mask)) {
		q->cq_head++;
		q->next_cqe_info += FUNETH_CQE_SIZE;
	} else {
		/* wrapped around: restart at the first CQE and flip phase */
		q->cq_head = 0;
		q->phase ^= 1;
		q->next_cqe_info = cqe_to_info(q->cqes);
	}
	prefetch(q->next_cqe_info);
}
/* Process the packet represented by the head CQE of @q. Gather the packet's * fragments, run it through the optional XDP program, and if needed construct * an skb and pass it to the stack.
*/ staticvoid fun_handle_cqe_pkt(struct funeth_rxq *q, struct funeth_txq *xdp_q)
{ conststruct fun_eth_cqe *rxreq = info_to_cqe(q->next_cqe_info); unsignedint i, tot_len, pkt_len = be32_to_cpu(rxreq->pkt_len); struct net_device *ndev = q->netdev;
skb_frag_t frags[RX_MAX_FRAGS]; struct skb_shared_info *si; unsignedint headroom;
gro_result_t gro_res; struct sk_buff *skb; int ref_ok; void *va;
u16 cv;
/* account for head- and tail-room, present only for 1-buffer packets */
tot_len = pkt_len;
headroom = be16_to_cpu(rxreq->headroom); if (likely(headroom))
tot_len += FUN_RX_TAILROOM + headroom;
ref_ok = fun_gather_pkt(q, tot_len, frags);
va = skb_frag_address(frags); if (xdp_q && headroom == FUN_XDP_HEADROOM) {
va = fun_run_xdp(q, frags, va, ref_ok, xdp_q); if (!va) return;
headroom = 0; /* XDP_PASS trims it */
} if (unlikely(!ref_ok)) goto no_mem;
if (likely(headroom)) { /* headroom is either FUN_RX_HEADROOM or FUN_XDP_HEADROOM */
prefetch(va + headroom);
skb = napi_build_skb(va, ALIGN(tot_len, FUN_EPRQ_PKT_ALIGN)); if (unlikely(!skb)) goto no_mem;
/* Release the references we've been granted for the frag pages. * We return the ref of the last frag and free the rest.
*/
q->cur_buf->pg_refs++; for (i = 0; i < rxreq->nsgl - 1; i++)
__free_page(skb_frag_page(frags + i));
}
/* NOTE(review): truncated in this extract -- the multi-frag/copy path, skb
 * metadata setup (len, checksum, hash, VLAN via @cv), the GRO hand-off using
 * @si/@gro_res/@ndev, the "no_mem" label referenced above, and the stats
 * updates are all missing.  Recover the full body before building.
 */
/* Check the phase tag of a CQE against the expected @phase.  Returns 0 when
 * they match, i.e., the CQE is new.
 */
static u16 cqe_phase_mismatch(const struct fun_cqe_info *ci, u16 phase)
{
	/* the phase tag is the low bit of the sf_p field */
	return (be16_to_cpu(ci->sf_p) & 1) ^ phase;
}
/* Walk through a CQ identifying and processing fresh CQEs up to the given
 * budget.  Return the remaining budget.
 */
static int fun_process_cqes(struct funeth_rxq *q, int budget)
{
	struct funeth_priv *fp = netdev_priv(q->netdev);
	struct funeth_txq *xdp_q = NULL;
	struct funeth_txq **xdpqs;

	/* pick this CPU's XDP Tx queue, if XDP is enabled on the device */
	xdpqs = rcu_dereference_bh(fp->xdpqs);
	if (xdpqs)
		xdp_q = xdpqs[smp_processor_id()];

	for (; budget && !cqe_phase_mismatch(q->next_cqe_info, q->phase);
	     budget--) {
		/* access other descriptor fields after the phase check */
		dma_rmb();
		fun_handle_cqe_pkt(q, xdp_q);
	}

	/* issue any doorbell writes / flushes deferred by XDP actions */
	if (unlikely(q->xdp_flush)) {
		if (q->xdp_flush & FUN_XDP_FLUSH_TX)
			fun_txq_wr_db(xdp_q);
		if (q->xdp_flush & FUN_XDP_FLUSH_REDIR)
			xdp_do_flush();
		q->xdp_flush = 0;
	}

	return budget;
}
/* NAPI handler for Rx queues. Calls the CQE processing loop and writes RQ/CQ * doorbells as needed.
*/ int fun_rxq_napi_poll(struct napi_struct *napi, int budget)
{ struct fun_irq *irq = container_of(napi, struct fun_irq, napi); struct funeth_rxq *q = irq->rxq; int work_done = budget - fun_process_cqes(q, budget);
u32 cq_db_val = q->cq_head;
/* NOTE(review): the remainder of the NAPI handler (napi_complete_done()
 * handling and the RQ/CQ doorbell writes using @work_done/@cq_db_val) is
 * missing from this extract; the lines below, which transfer counters and
 * kfree() the queue and reference an undeclared @fp, look spliced in from a
 * queue-free routine in the same file.  Untangle against the full source
 * before building.
 */
/* Before freeing the queue transfer key counters to the device. */
fp->rx_packets += q->stats.rx_pkts;
fp->rx_bytes += q->stats.rx_bytes;
fp->rx_dropped += q->stats.rx_map_err + q->stats.rx_mem_drops;
kfree(q);
}
/* Create an Rx queue's resources on the device. */ int fun_rxq_create_dev(struct funeth_rxq *q, struct fun_irq *irq)
{ struct funeth_priv *fp = netdev_priv(q->netdev); unsignedint ncqe = q->cq_mask + 1; unsignedint nrqe = q->rq_mask + 1; int err;
err = xdp_rxq_info_reg(&q->xdp_rxq, q->netdev, q->qidx,
irq->napi.napi_id); if (err) goto out;
err = xdp_rxq_info_reg_mem_model(&q->xdp_rxq, MEM_TYPE_PAGE_SHARED,
NULL); if (err) goto xdp_unreg;
/* NOTE(review): truncated in this extract -- the device commands that create
 * the CQ/RQ (using @fp, @ncqe, @nrqe), the success path, and the "xdp_unreg"
 * and "out" error labels are missing.  Recover the full body before building.
 */
/* Create or advance an Rx queue, allocating all the host and device resources * needed to reach the target state.
*/ int funeth_rxq_create(struct net_device *dev, unsignedint qidx, unsignedint ncqe, unsignedint nrqe, struct fun_irq *irq, int state, struct funeth_rxq **qp)
{ struct funeth_rxq *q = *qp; int err;
if (!q) {
q = fun_rxq_create_sw(dev, qidx, ncqe, nrqe, irq); if (IS_ERR(q)) return PTR_ERR(q);
}
if (q->init_state >= state) goto out;
err = fun_rxq_create_dev(q, irq); if (err) { if (!*qp)
fun_rxq_free_sw(q); return err;
}
out:
*qp = q; return 0;
}
/* Free an Rx queue's resources down to the target @state.  Returns the queue
 * if it still exists in some state, or %NULL once fully destroyed.
 */
struct funeth_rxq *funeth_rxq_free(struct funeth_rxq *q, int state)
{
	if (state < FUN_QSTATE_INIT_FULL)
		fun_rxq_free_dev(q);

	if (state == FUN_QSTATE_DESTROYED) {
		fun_rxq_free_sw(q);
		q = NULL;
	}

	/* FIX: the original fell off the end of a non-void function; callers
	 * use the returned pointer, so that was undefined behavior.  Return
	 * the (possibly NULL) queue explicitly.
	 */
	return q;
}
Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.