/* * Linux driver for VMware's vmxnet3 ethernet NIC. * * Copyright (C) 2008-2024, VMware, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; version 2 of the License and no later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * * Maintained by: pv-drivers@vmware.com *
*/
/* * PCI Device ID Table * Last entry must be all 0s
*/ staticconststruct pci_device_id vmxnet3_pciid_table[] = {
{PCI_VDEVICE(VMWARE, PCI_DEVICE_ID_VMWARE_VMXNET3)},
{0}
};
/* Check if capability is supported by UPT device or
 * UPT is even requested
 */
bool
vmxnet3_check_ptcapability(u32 cap_supported, u32 cap)
{
	/* A set DCR_ERROR bit is treated the same as the capability bit
	 * itself being set.
	 */
	if (cap_supported & (1UL << VMXNET3_DCR_ERROR) ||
	    cap_supported & (1UL << cap)) {
		return true;
	}

	return false;
}
/*
 * Check the link state. This may start or stop the tx queue.
 */
static void
vmxnet3_check_link(struct vmxnet3_adapter *adapter, bool affectTxQueue)
{
	u32 ret;
	int i;
	unsigned long flags;

	/* Query the device for the current link state; the cmd register
	 * must be accessed under cmd_lock.  (This query was missing and
	 * 'ret' was read uninitialized.)
	 */
	spin_lock_irqsave(&adapter->cmd_lock, flags);
	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_LINK);
	ret = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD);
	spin_unlock_irqrestore(&adapter->cmd_lock, flags);

	/* high 16 bits carry the speed, bit 0 the up/down state */
	adapter->link_speed = ret >> 16;
	if (ret & 1) { /* Link is up. */
		/* From vmxnet3 v9, the hypervisor reports the speed in Gbps.
		 * Convert the speed to Mbps before reporting it to the kernel.
		 * Max link speed supported is 10000G.
		 */
		if (VMXNET3_VERSION_GE_9(adapter) &&
		    adapter->link_speed < 10000)
			adapter->link_speed = adapter->link_speed * 1000;

		netdev_info(adapter->netdev, "NIC Link is Up %d Mbps\n",
			    adapter->link_speed);
		netif_carrier_on(adapter->netdev);

		if (affectTxQueue) {
			for (i = 0; i < adapter->num_tx_queues; i++)
				vmxnet3_tq_start(&adapter->tx_queue[i],
						 adapter);
		}
	} else {
		netdev_info(adapter->netdev, "NIC Link is Down\n");
		netif_carrier_off(adapter->netdev);

		if (affectTxQueue) {
			for (i = 0; i < adapter->num_tx_queues; i++)
				vmxnet3_tq_stop(&adapter->tx_queue[i], adapter);
		}
	}
}
staticvoid
vmxnet3_process_events(struct vmxnet3_adapter *adapter)
{ int i; unsignedlong flags;
u32 events = le32_to_cpu(adapter->shared->ecr); if (!events) return;
vmxnet3_ack_events(adapter, events);
/* Check if link state has changed */ if (events & VMXNET3_ECR_LINK)
vmxnet3_check_link(adapter, true);
/* Check if there is an error on xmit/recv queues */ if (events & (VMXNET3_ECR_TQERR | VMXNET3_ECR_RQERR)) {
spin_lock_irqsave(&adapter->cmd_lock, flags);
VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
VMXNET3_CMD_GET_QUEUE_STATUS);
spin_unlock_irqrestore(&adapter->cmd_lock, flags);
for (i = 0; i < adapter->num_tx_queues; i++) if (adapter->tqd_start[i].status.stopped)
dev_err(&adapter->netdev->dev, "%s: tq[%d] error 0x%x\n",
adapter->netdev->name, i, le32_to_cpu(
adapter->tqd_start[i].status.error)); for (i = 0; i < adapter->num_rx_queues; i++) if (adapter->rqd_start[i].status.stopped)
dev_err(&adapter->netdev->dev, "%s: rq[%d] error 0x%x\n",
adapter->netdev->name, i,
adapter->rqd_start[i].status.error);
schedule_work(&adapter->work);
}
}
#ifdef __BIG_ENDIAN_BITFIELD
/*
 * The device expects the bitfields in shared structures to be written in
 * little endian. When CPU is big endian, the following routines are used to
 * correctly read and write into ABI.
 * The general technique used here is : double word bitfields are defined in
 * opposite order for big endian architecture. Then before reading them in
 * driver the complete double word is translated using le32_to_cpu. Similarly
 * After the driver writes into bitfields, cpu_to_le32 is used to translate the
 * double words into required format.
 * In order to avoid touching bits in shared structure more than once, temporary
 * descriptors are used. These are passed as srcDesc to following functions.
 */
static void vmxnet3_RxDescToCPU(const struct Vmxnet3_RxDesc *srcDesc,
				struct Vmxnet3_RxDesc *dstDesc)
{
	/* dword 2 holds the bitfields; translate it as a whole u32 */
	u32 *src = (u32 *)srcDesc + 2;
	u32 *dst = (u32 *)dstDesc + 2;

	dstDesc->addr = le64_to_cpu(srcDesc->addr);
	*dst = le32_to_cpu(*src);
	dstDesc->ext1 = le32_to_cpu(srcDesc->ext1);
}
/* no out of order completion */
/* NOTE(review): this fragment starts mid-function -- the enclosing
 * definition (apparently a tx packet unmap/complete helper using eop_idx,
 * pdev, entries, tbi and map_type) is missing from this chunk and cannot
 * be safely reconstructed here.
 */
BUG_ON(tq->buf_info[eop_idx].sop_idx != tq->tx_ring.next2comp);
BUG_ON(VMXNET3_TXDESC_GET_EOP(&(tq->tx_ring.base[eop_idx].txd)) != 1);
while (tq->tx_ring.next2comp != eop_idx) {
vmxnet3_unmap_tx_buf(tq->buf_info + tq->tx_ring.next2comp,
pdev);
/* update next2comp w/o tx_lock. Since we are marking more, * instead of less, tx ring entries avail, the worst case is * that the tx routine incorrectly re-queues a pkt due to * insufficient tx ring entries.
*/
vmxnet3_cmd_ring_adv_next2comp(&tq->tx_ring);
entries++;
}
/* NOTE(review): tbi/map_type are undeclared in the visible text --
 * presumably declared in the lost portion of this function; verify
 * against the upstream driver source.
 */
if (map_type & VMXNET3_MAP_XDP)
xdp_return_frame_bulk(tbi->xdpf, bq); else
dev_kfree_skb_any(tbi->skb);
/* xdpf and skb are in an anonymous union. */
tbi->skb = NULL;
return entries;
}
/* NOTE(review): this function body is incomplete -- 'tbi', 'map_type'
 * and 'i' are used but never declared in the visible text, and the
 * trailing 'return completed;' plus closing brace are missing; parts of
 * another function appear spliced in.  Left byte-identical pending
 * comparison with the upstream driver source.
 */
staticint
vmxnet3_tq_tx_complete(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter)
{ union Vmxnet3_GenericDesc *gdesc; struct xdp_frame_bulk bq; int completed = 0;
xdp_frame_bulk_init(&bq);
rcu_read_lock();
gdesc = tq->comp_ring.base + tq->comp_ring.next2proc; while (VMXNET3_TCD_GET_GEN(&gdesc->tcd) == tq->comp_ring.gen) { /* Prevent any &gdesc->tcd field from being (speculatively) * read before (&gdesc->tcd)->gen is read.
*/
dma_rmb();
vmxnet3_unmap_tx_buf(tbi, adapter->pdev); if (tbi->skb) { if (map_type & VMXNET3_MAP_XDP)
xdp_return_frame_bulk(tbi->xdpf, &bq); else
dev_kfree_skb_any(tbi->skb);
tbi->skb = NULL;
}
vmxnet3_cmd_ring_adv_next2comp(&tq->tx_ring);
}
xdp_flush_frame_bulk(&bq);
rcu_read_unlock();
/* sanity check, verify all buffers are indeed unmapped */ for (i = 0; i < tq->tx_ring.size; i++)
BUG_ON(tq->buf_info[i].map_type != VMXNET3_MAP_NONE);
/* Destroy all tx queues */
void
vmxnet3_tq_destroy_all(struct vmxnet3_adapter *adapter)
{
	int q;

	for (q = 0; q < adapter->num_tx_queues; q++)
		vmxnet3_tq_destroy(&adapter->tx_queue[q], adapter);
}
/* Reset a tx queue's rings and bookkeeping to their initial state. */
static void
vmxnet3_tq_init(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter)
{
	int i;

	/* reset the tx ring contents to 0 and reset the tx ring states */
	memset(tq->tx_ring.base, 0,
	       tq->tx_ring.size * sizeof(struct Vmxnet3_TxDesc));
	tq->tx_ring.next2fill = tq->tx_ring.next2comp = 0;
	tq->tx_ring.gen = VMXNET3_INIT_GEN;

	/* timestamp ring is optional */
	if (tq->ts_ring.base)
		memset(tq->ts_ring.base, 0,
		       tq->tx_ring.size * tq->tx_ts_desc_size);

	/* reset the tx comp ring contents to 0 and reset comp ring states */
	memset(tq->comp_ring.base, 0,
	       tq->comp_ring.size * sizeof(struct Vmxnet3_TxCompDesc));
	tq->comp_ring.next2proc = 0;
	tq->comp_ring.gen = VMXNET3_INIT_GEN;

	/* reset the bookkeeping data */
	memset(tq->buf_info, 0, sizeof(tq->buf_info[0]) * tq->tx_ring.size);
	for (i = 0; i < tq->tx_ring.size; i++)
		tq->buf_info[i].map_type = VMXNET3_MAP_NONE;
}
/* Clean up all tx queues. */
static void
vmxnet3_tq_cleanup_all(struct vmxnet3_adapter *adapter)
{
	int i;

	for (i = 0; i < adapter->num_tx_queues; i++)
		vmxnet3_tq_cleanup(&adapter->tx_queue[i], adapter);
}
/* * starting from ring->next2fill, allocate rx buffers for the given ring * of the rx queue and update the rx desc. stop after @num_to_alloc buffers * are allocated or allocation fails
*/
/* NOTE(review): what follows is a splice of fragments from at least two
 * different functions (an rx-buffer allocator and a tx map-pkt routine);
 * locals such as rbi, dw2, ctx, tq, len, buf_offset and tbi are never
 * declared in the visible text.  Left byte-identical; reconstruct from
 * the upstream driver source.
 */
/* Fill the last buffer but dont mark it ready, or else the
* device will think that the queue is full */ if (num_allocated == num_to_alloc) {
rbi->comp_state = VMXNET3_RXD_COMP_DONE; break;
}
/* use the previous gen bit for the SOP desc */
dw2 = (tq->tx_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
ctx->sop_txd = tq->tx_ring.base + tq->tx_ring.next2fill;
gdesc = ctx->sop_txd; /* both loops below can be skipped */
/* no need to map the buffer if headers are copied */ if (ctx->copy_size) {
ctx->sop_txd->txd.addr = cpu_to_le64(tq->data_ring.basePA +
tq->tx_ring.next2fill *
tq->txdata_desc_size);
ctx->sop_txd->dword[2] = cpu_to_le32(dw2 | ctx->copy_size);
ctx->sop_txd->dword[3] = 0;
/* use the right gen for non-SOP desc */
dw2 = tq->tx_ring.gen << VMXNET3_TXD_GEN_SHIFT;
}
/* linear part can use multiple tx desc if it's big */
len = skb_headlen(skb) - ctx->copy_size;
buf_offset = ctx->copy_size; while (len) {
u32 buf_size;
if (len < VMXNET3_MAX_TX_BUF_SIZE) {
buf_size = len;
dw2 |= len;
} else {
buf_size = VMXNET3_MAX_TX_BUF_SIZE; /* spec says that for TxDesc.len, 0 == 2^14 */
}
/* set the last buf_info for the pkt */
tbi->skb = skb;
tbi->sop_idx = ctx->sop_txd - tq->tx_ring.base; if (tq->tx_ts_desc_size != 0) {
ctx->ts_txd = (struct Vmxnet3_TxTSDesc *)((u8 *)tq->ts_ring.base +
tbi->sop_idx * tq->tx_ts_desc_size);
ctx->ts_txd->ts.tsi = 0;
}
return 0;
}
/* Init all tx queues */
static void
vmxnet3_tq_init_all(struct vmxnet3_adapter *adapter)
{
	int i;

	for (i = 0; i < adapter->num_tx_queues; i++)
		vmxnet3_tq_init(&adapter->tx_queue[i], adapter);
}
/* * parse relevant protocol headers: * For a tso pkt, relevant headers are L2/3/4 including options * For a pkt requesting csum offloading, they are L2/3 and may include L4 * if it's a TCP/UDP pkt * * Returns: * -1: error happens during parsing * 0: protocol headers parsed, but too big to be copied * 1: protocol headers parsed and copied * * Other effects: * 1. related *ctx fields are updated. * 2. ctx->copy_size is # of bytes copied * 3. the portion to be copied is guaranteed to be in the linear part *
*/ staticint
vmxnet3_parse_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, struct vmxnet3_tx_ctx *ctx, struct vmxnet3_adapter *adapter)
{
u8 protocol = 0;
/* NOTE(review): function is truncated below -- the CHECKSUM_PARTIAL
 * branch is cut off mid-statement and the closing brace is missing.
 */
if (ctx->mss) { /* TSO */ if (VMXNET3_VERSION_GE_4(adapter) && skb->encapsulation) {
ctx->l4_offset = skb_inner_transport_offset(skb);
ctx->l4_hdr_size = inner_tcp_hdrlen(skb);
ctx->copy_size = ctx->l4_offset + ctx->l4_hdr_size;
} else {
ctx->l4_offset = skb_transport_offset(skb);
ctx->l4_hdr_size = tcp_hdrlen(skb);
ctx->copy_size = ctx->l4_offset + ctx->l4_hdr_size;
}
} else { if (skb->ip_summed == CHECKSUM_PARTIAL) { /* For encap packets, skb_checksum_start_offset refers * to inner L4 offset. Thus, below works for encap as * well as non-encap case
*/
ctx->l4_offset = skb_checksum_start_offset(skb);
if (VMXNET3_VERSION_GE_4(adapter) &&
skb->encapsulation) { struct iphdr *iph = inner_ip_hdr(skb);
/* * copy relevant protocol headers to the transmit ring: * For a tso pkt, relevant headers are L2/3/4 including options * For a pkt requesting csum offloading, they are L2/3 and may include L4 * if it's a TCP/UDP pkt * * * Note that this requires that vmxnet3_parse_hdr be called first to set the * appropriate bits in ctx first
*/ staticvoid
vmxnet3_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, struct vmxnet3_tx_ctx *ctx, struct vmxnet3_adapter *adapter)
{ struct Vmxnet3_TxDataDesc *tdd;
/* NOTE(review): the remainder of this function's body is missing from
 * this chunk; only the opening and one local declaration survive.
 */
/* * Transmits a pkt thru a given tq * Returns: * NETDEV_TX_OK: descriptors are setup successfully * NETDEV_TX_OK: error occurred, the pkt is dropped * NETDEV_TX_BUSY: tx ring is full, queue is stopped * * Side-effects: * 1. tx ring may be changed * 2. tq stats may be updated accordingly * 3. shared->txNumDeferred may be updated
*/
/* NOTE(review): this function is truncated and spliced -- 'count' is
 * read before any visible assignment, rx-path comments appear inside the
 * tx-full branch, and the function ends mid-body.  Left byte-identical;
 * reconstruct from the upstream driver source.
 */
staticint
vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter, struct net_device *netdev)
{ int ret;
u32 count; int num_pkts; int tx_num_deferred; unsignedlong flags; struct vmxnet3_tx_ctx ctx; union Vmxnet3_GenericDesc *gdesc; #ifdef __BIG_ENDIAN_BITFIELD /* Use temporary descriptor to avoid touching bits multiple times */ union Vmxnet3_GenericDesc tempTxDesc; #endif
ctx.mss = skb_shinfo(skb)->gso_size; if (ctx.mss) { if (skb_header_cloned(skb)) { if (unlikely(pskb_expand_head(skb, 0, 0,
GFP_ATOMIC) != 0)) {
tq->stats.drop_tso++; goto drop_pkt;
}
tq->stats.copy_skb_header++;
} if (unlikely(count > VMXNET3_MAX_TSO_TXD_PER_PKT)) { /* tso pkts must not use more than * VMXNET3_MAX_TSO_TXD_PER_PKT entries
*/ if (skb_linearize(skb) != 0) {
tq->stats.drop_too_many_frags++; goto drop_pkt;
}
tq->stats.linearized++;
/* recalculate the # of descriptors to use */
count = VMXNET3_TXD_NEEDED(skb_headlen(skb)) + 1; if (unlikely(count > VMXNET3_MAX_TSO_TXD_PER_PKT)) {
tq->stats.drop_too_many_frags++; goto drop_pkt;
}
} if (skb->encapsulation) {
vmxnet3_prepare_inner_tso(skb, &ctx);
} else {
vmxnet3_prepare_tso(skb, &ctx);
}
} else { if (unlikely(count > VMXNET3_MAX_TXD_PER_PKT)) {
/* non-tso pkts must not use more than * VMXNET3_MAX_TXD_PER_PKT entries
*/ if (skb_linearize(skb) != 0) {
tq->stats.drop_too_many_frags++; goto drop_pkt;
}
tq->stats.linearized++;
/* recalculate the # of descriptors to use */
count = VMXNET3_TXD_NEEDED(skb_headlen(skb)) + 1;
}
}
ret = vmxnet3_parse_hdr(skb, tq, &ctx, adapter); if (ret >= 0) {
BUG_ON(ret <= 0 && ctx.copy_size != 0); /* hdrs parsed, check against other limits */ if (ctx.mss) { if (unlikely(ctx.l4_offset + ctx.l4_hdr_size >
VMXNET3_MAX_TX_BUF_SIZE)) {
tq->stats.drop_oversized_hdr++; goto drop_pkt;
}
} else { if (skb->ip_summed == CHECKSUM_PARTIAL) { if (unlikely(ctx.l4_offset +
skb->csum_offset >
VMXNET3_MAX_CSUM_OFFSET)) {
tq->stats.drop_oversized_hdr++; goto drop_pkt;
}
}
}
} else {
tq->stats.drop_hdr_inspect_err++; goto drop_pkt;
}
spin_lock_irqsave(&tq->tx_lock, flags);
if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) {
tq->stats.tx_ring_full++;
netdev_dbg(adapter->netdev, "tx queue stopped on %s, next2comp %u" " next2fill %u\n", adapter->netdev->name,
tq->tx_ring.next2comp, tq->tx_ring.next2fill);
/* * We do not unmap and chain the rx buffer to the skb. * We basically pretend this buffer is not used and will be recycled * by vmxnet3_rq_alloc_rx_buf()
*/
/* * ctx->skb may be NULL if this is the first and the only one * desc for the pkt
*/ if (ctx->skb)
dev_kfree_skb_irq(ctx->skb);
/* NOTE(review): fragment of the rx completion routine spliced with the
 * tail of an rq cleanup/destroy routine -- rcd, rxComp, num_pkts, quota,
 * skip_page_frags, ctx and encap_lro are never declared in the visible
 * text, and several braces do not balance.  Left byte-identical;
 * reconstruct from the upstream driver source.
 */
vmxnet3_getRxComp(rcd, &rq->comp_ring.base[rq->comp_ring.next2proc].rcd,
&rxComp); while (rcd->gen == rq->comp_ring.gen) { struct vmxnet3_rx_buf_info *rbi; struct sk_buff *skb, *new_skb = NULL; struct page *new_page = NULL;
dma_addr_t new_dma_addr; int num_to_alloc; struct Vmxnet3_RxDesc *rxd;
u32 idx, ring_idx; struct vmxnet3_cmd_ring *ring = NULL; if (num_pkts >= quota) { /* we may stop even before we see the EOP desc of * the current pkt
*/ break;
}
/* Prevent any rcd field from being (speculatively) read before * rcd->gen is read.
*/
dma_rmb();
/* non SOP buffer must be type 1 in most cases */
BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_PAGE);
BUG_ON(rxd->btype != VMXNET3_RXD_BTYPE_BODY);
/* If an sop buffer was dropped, skip all * following non-sop fragments. They will be reused.
*/ if (skip_page_frags) goto rcd_done;
if (rcd->len) {
new_page = alloc_page(GFP_ATOMIC); /* Replacement page frag could not be allocated. * Reuse this page. Drop the pkt and free the * skb which contained this page as a frag. Skip * processing all the following non-sop frags.
*/ if (unlikely(!new_page)) {
rq->stats.rx_buf_alloc_failure++;
dev_kfree_skb(ctx->skb);
ctx->skb = NULL;
skip_page_frags = true; goto rcd_done;
}
new_dma_addr = dma_map_page(&adapter->pdev->dev,
new_page,
0, PAGE_SIZE,
DMA_FROM_DEVICE); if (dma_mapping_error(&adapter->pdev->dev,
new_dma_addr)) {
put_page(new_page);
rq->stats.rx_buf_alloc_failure++;
dev_kfree_skb(ctx->skb);
ctx->skb = NULL;
skip_page_frags = true; goto rcd_done;
}
/* Use GRO callback if UPT is enabled */ if ((adapter->netdev->features & NETIF_F_LRO) &&
!rq->shared->updateRxProd)
netif_receive_skb(skb); else
napi_gro_receive(&rq->napi, skb);
ctx->skb = NULL;
encap_lro = false;
num_pkts++;
}
rcd_done: /* device may have skipped some rx descs */
ring = rq->rx_ring + ring_idx;
rbi->comp_state = VMXNET3_RXD_COMP_DONE;
if (xdp_rxq_info_is_reg(&rq->xdp_rxq))
xdp_rxq_info_unreg(&rq->xdp_rxq);
page_pool_destroy(rq->page_pool);
rq->page_pool = NULL;
}
/* Clean up all rx queues and detach the XDP program. */
static void
vmxnet3_rq_cleanup_all(struct vmxnet3_adapter *adapter)
{
	int i;

	for (i = 0; i < adapter->num_rx_queues; i++)
		vmxnet3_rq_cleanup(&adapter->rx_queue[i], adapter);
	rcu_assign_pointer(adapter->xdp_bpf_prog, NULL);
}
/* NOTE(review): this function is truncated -- the buf_info release and
 * the closing brace are missing from this chunk.  Left byte-identical;
 * reconstruct from the upstream driver source.
 */
staticvoid vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, struct vmxnet3_adapter *adapter)
{ int i; int j;
/* all rx buffers must have already been freed */ for (i = 0; i < 2; i++) { if (rq->buf_info[i]) { for (j = 0; j < rq->rx_ring[i].size; j++)
BUG_ON(rq->buf_info[i][j].page != NULL);
}
}
for (i = 0; i < 2; i++) { if (rq->rx_ring[i].base) {
dma_free_coherent(&adapter->pdev->dev,
rq->rx_ring[i].size
* sizeof(struct Vmxnet3_RxDesc),
rq->rx_ring[i].base,
rq->rx_ring[i].basePA);
rq->rx_ring[i].base = NULL;
}
}
/* Multiple queue aware polling function for tx and rx */
static int
vmxnet3_do_poll(struct vmxnet3_adapter *adapter, int budget)
{
	int rcd_done = 0, i;

	/* process device events first if any are pending */
	if (unlikely(adapter->shared->ecr))
		vmxnet3_process_events(adapter);
	for (i = 0; i < adapter->num_tx_queues; i++)
		vmxnet3_tq_tx_complete(&adapter->tx_queue[i], adapter);

	for (i = 0; i < adapter->num_rx_queues; i++)
		rcd_done += vmxnet3_rq_rx_complete(&adapter->rx_queue[i],
						   adapter, budget);
	return rcd_done;
}
/* NOTE(review): this is a splice of (at least) the NAPI poll routine and
 * an MSI-X tx interrupt handler -- 'adapter', 'rq' and 'tq' are never
 * declared in the visible text and the function returns IRQ_HANDLED
 * despite the vmxnet3_poll signature.  Left byte-identical; reconstruct
 * from the upstream driver source.
 */
staticint
vmxnet3_poll(struct napi_struct *napi, int budget)
{ struct vmxnet3_rx_queue *rx_queue = container_of(napi, struct vmxnet3_rx_queue, napi); int rxd_done;
/* When sharing interrupt with corresponding tx queue, process * tx completions in that queue as well
*/ if (adapter->share_intr == VMXNET3_INTR_BUDDYSHARE) { struct vmxnet3_tx_queue *tq =
&adapter->tx_queue[rq - adapter->rx_queue];
vmxnet3_tq_tx_complete(tq, adapter);
}
if (adapter->intr.mask_mode == VMXNET3_IMM_ACTIVE)
vmxnet3_disable_intr(adapter, tq->comp_ring.intr_idx);
/* Handle the case where only one irq is allocate for all tx queues */ if (adapter->share_intr == VMXNET3_INTR_TXSHARE) { int i; for (i = 0; i < adapter->num_tx_queues; i++) { struct vmxnet3_tx_queue *txq = &adapter->tx_queue[i];
vmxnet3_tq_tx_complete(txq, adapter);
}
} else {
vmxnet3_tq_tx_complete(tq, adapter);
}
vmxnet3_enable_intr(adapter, tq->comp_ring.intr_idx);
return IRQ_HANDLED;
}
/* * Handle completion interrupts on rx queues. Returns whether or not the * intr is handled
*/
/*
 * NOTE(review): the text below is extraneous website boilerplate (German)
 * that does not belong in this source file; it is preserved here in
 * English translation, wrapped in a comment so it no longer breaks the
 * build, and should simply be deleted:
 * "The information on this web page has been compiled carefully and to
 * the best of our knowledge. However, no guarantee is given as to the
 * completeness, correctness, or quality of the information provided.
 * Remark: the colored syntax highlighting and the measurement are still
 * experimental."
 */