netdev_err(dev, "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. napi scheduled: %d\n",
txqueue,
threshold,
time_since_last_napi,
napi_scheduled);
if (threshold < time_since_last_napi && napi_scheduled) {
netdev_err(dev, "napi handler hasn't been called for a long time but is scheduled\n");
reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION;
}
schedule_reset: /* Change the state of the device to trigger reset * Check that we are not in the middle or a trigger already
*/ if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) return;
/* update_rx_ring_mtu - propagate a new MTU value to every Rx ring
 * @adapter: driver private structure
 * @mtu: new MTU in bytes
 *
 * Updates only the per-ring software copy; the device itself is
 * configured separately via ena_com_set_dev_mtu() (see ena_change_mtu).
 */
static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu)
{
	int i;

	for (i = 0; i < adapter->num_io_queues; i++)
		adapter->rx_ring[i].mtu = mtu;
}
/* ena_change_mtu - .ndo_change_mtu handler
 * @dev: network interface device structure
 * @new_mtu: requested MTU in bytes
 *
 * Asks the device to switch to the new MTU and, on success, mirrors the
 * value into the Rx rings and the netdev. WRITE_ONCE() is used because
 * dev->mtu is presumably read locklessly elsewhere — TODO confirm.
 *
 * Return: 0 on success, negative errno from ena_com_set_dev_mtu() on failure.
 */
static int ena_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ena_adapter *adapter = netdev_priv(dev);
	int ret;

	ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu);
	if (!ret) {
		netif_dbg(adapter, drv, dev, "Set MTU to %d\n", new_mtu);
		update_rx_ring_mtu(adapter, new_mtu);
		WRITE_ONCE(dev->mtu, new_mtu);
	} else {
		netif_err(adapter, drv, dev, "Failed to set MTU to %d\n",
			  new_mtu);
	}

	/* NOTE(review): the original text was truncated here; the trailing
	 * "return ret;" and closing brace are restored so the function
	 * reports the device's result to the stack.
	 */
	return ret;
}
if (unlikely(ena_com_is_doorbell_needed(ring->ena_com_io_sq,
ena_tx_ctx))) {
netif_dbg(adapter, tx_queued, adapter->netdev, "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n",
ring->qid);
ena_ring_tx_doorbell(ring);
}
/* prepare the packet's descriptors to dma engine */
rc = ena_com_prepare_tx(ring->ena_com_io_sq, ena_tx_ctx,
&nb_hw_desc);
/* In case there isn't enough space in the queue for the packet, * we simply drop it. All other failure reasons of * ena_com_prepare_tx() are fatal and therefore require a device reset.
*/ if (unlikely(rc)) {
netif_err(adapter, tx_queued, adapter->netdev, "Failed to prepare tx bufs\n");
ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, &ring->syncp); if (rc != -ENOMEM)
ena_reset_device(adapter, ENA_REGS_RESET_DRIVER_INVALID_STATE); return rc;
}
if (rx_ring->rx_buffer_info) {
netif_err(adapter, ifup, adapter->netdev, "rx_buffer_info is not NULL"); return -EEXIST;
}
/* alloc extra element so in rx path * we can always prefetch rx_info + 1
*/
size = sizeof(struct ena_rx_buffer) * (rx_ring->ring_size + 1);
node = cpu_to_node(ena_irq->cpu);
rx_ring->rx_buffer_info = vzalloc_node(size, node); if (!rx_ring->rx_buffer_info) {
rx_ring->rx_buffer_info = vzalloc(size); if (!rx_ring->rx_buffer_info) return -ENOMEM;
}
/* This would allocate the page on the same NUMA node the executing code * is running on.
*/
page = dev_alloc_page(); if (!page) {
ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, &rx_ring->syncp); return ERR_PTR(-ENOSPC);
}
/* To enable NIC-side port-mirroring, AKA SPAN port, * we make the buffer readable from the nic as well
*/
*dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE,
DMA_BIDIRECTIONAL); if (unlikely(dma_mapping_error(rx_ring->dev, *dma))) {
ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1,
&rx_ring->syncp);
__free_page(page); return ERR_PTR(-EIO);
}
/* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer * for data + headroom + tailroom.
*/ if (SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) {
page_ref_inc(rx_info->page);
rx_info->page_offset += buf_len;
ena_buf->paddr += buf_len;
ena_buf->len -= buf_len; returntrue;
}
/* If XDP isn't loaded try to reuse part of the RX buffer */
reuse_rx_buf_page = !is_xdp_loaded &&
ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset);
if (!reuse_rx_buf_page)
ena_unmap_rx_buff_attrs(rx_ring, rx_info, DMA_ATTR_SKIP_CPU_SYNC);
skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); if (unlikely(!skb)) return NULL;
/* Populate skb's linear part */
skb_reserve(skb, buf_offset);
skb_put(skb, len);
skb->protocol = eth_type_trans(skb, rx_ring->netdev);
do {
netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "RX skb updated. len %d. data_len %d\n",
skb->len, skb->data_len);
/* ena_rx_checksum - indicate in skb if hw indicated a good cksum * @adapter: structure containing adapter specific data * @ena_rx_ctx: received packet context/metadata * @skb: skb currently being received and modified
*/ staticvoid ena_rx_checksum(struct ena_ring *rx_ring, struct ena_com_rx_ctx *ena_rx_ctx, struct sk_buff *skb)
{ /* Rx csum disabled */ if (unlikely(!(rx_ring->netdev->features & NETIF_F_RXCSUM))) {
skb->ip_summed = CHECKSUM_NONE; return;
}
/* For fragmented packets the checksum isn't valid */ if (ena_rx_ctx->frag) {
skb->ip_summed = CHECKSUM_NONE; return;
}
/* if IP and error */ if (unlikely((ena_rx_ctx->l3_proto == ENA_ETH_IO_L3_PROTO_IPV4) &&
(ena_rx_ctx->l3_csum_err))) { /* ipv4 checksum error */
skb->ip_summed = CHECKSUM_NONE;
ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1,
&rx_ring->syncp);
netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX IPv4 header checksum error\n"); return;
}
/* First descriptor might have an offset set by the device */
rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id];
pkt_offset = ena_rx_ctx.pkt_offset;
rx_info->buf_offset += pkt_offset;
netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n",
rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto,
ena_rx_ctx.l4_proto, ena_rx_ctx.hash);
/* Rx ring can be NULL when for XDP tx queues which don't have an * accompanying rx_ring pair.
*/ if (rx_ring)
rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ?
rx_ring->smoothed_interval :
ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev);
/* It is a shared MSI-X. * Tx and Rx CQ have pointer to it. * So we use one of them to reach the intr reg * The Tx ring is used because the rx_ring is NULL for XDP queues
*/
ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg);
}
/* ena_update_ring_numa_node - track the CPU/NUMA node the rings run on
 * @tx_ring: Tx ring of the queue pair
 * @rx_ring: Rx ring of the queue pair; may be NULL (XDP queues)
 *
 * Records the executing CPU on both rings and, when the NUMA node has
 * changed, tells the device about the new node for each completion queue.
 * Preemption is disabled (get_cpu) only while the CPU/node fields are
 * inspected and updated; it is re-enabled before the ena_com calls.
 */
void ena_update_ring_numa_node(struct ena_ring *tx_ring, struct ena_ring *rx_ring)
{
	int curr_cpu = get_cpu();
	int node;

	/* Both rings of a pair run on the same CPU, so checking one suffices */
	if (likely(tx_ring->cpu == curr_cpu)) {
		put_cpu();
		return;
	}

	tx_ring->cpu = curr_cpu;
	if (rx_ring)
		rx_ring->cpu = curr_cpu;

	node = cpu_to_node(curr_cpu);
	if (likely(tx_ring->numa_node == node)) {
		put_cpu();
		return;
	}

	put_cpu();

	if (node != NUMA_NO_NODE) {
		ena_com_update_numa_node(tx_ring->ena_com_io_cq, node);
		tx_ring->numa_node = node;
		if (rx_ring) {
			ena_com_update_numa_node(rx_ring->ena_com_io_cq, node);
			rx_ring->numa_node = node;
		}
	}
}
staticint ena_io_poll(struct napi_struct *napi, int budget)
{ struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); struct ena_ring *tx_ring, *rx_ring; int tx_work_done; int rx_work_done = 0; int tx_budget; int napi_comp_call = 0; int ret;
tx_work_done = ena_clean_tx_irq(tx_ring, tx_budget); /* On netpoll the budget is zero and the handler should only clean the * tx completions.
*/ if (likely(budget))
rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget);
/* If the device is about to reset or down, avoid unmask * the interrupt and return 0 so NAPI won't reschedule
*/ if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) ||
test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags))) {
napi_complete_done(napi, 0);
ret = 0;
/* Update numa and unmask the interrupt only when schedule * from the interrupt context (vs from sk_busy_loop)
*/ if (napi_complete_done(napi, rx_work_done) &&
READ_ONCE(ena_napi->interrupts_masked)) {
smp_rmb(); /* make sure interrupts_masked is read */
WRITE_ONCE(ena_napi->interrupts_masked, false); /* We apply adaptive moderation on Rx path only. * Tx uses static interrupt moderation.
*/ if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev))
ena_adjust_adaptive_rx_intr_moderation(ena_napi);
/* Don't call the aenq handler before probe is done */ if (likely(test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)))
ena_com_aenq_intr_handler(adapter->ena_dev, data);
return IRQ_HANDLED;
}
/* ena_intr_msix_io - MSI-X Interrupt Handler for Tx/Rx
 * @irq: interrupt number
 * @data: pointer to a network interface private napi device structure
 *
 * Records that an interrupt was seen (HW health tracking), marks the
 * interrupts as masked, and kicks NAPI. The write memory barrier orders
 * the interrupts_masked store before the NAPI schedule.
 */
static irqreturn_t ena_intr_msix_io(int irq, void *data)
{
	struct ena_napi *enapi = data;

	/* Used to check HW health */
	WRITE_ONCE(enapi->first_interrupt, true);

	WRITE_ONCE(enapi->interrupts_masked, true);
	smp_wmb(); /* write interrupts_masked before calling napi */

	napi_schedule_irqoff(&enapi->napi);

	return IRQ_HANDLED;
}
/* Reserve a single MSI-X vector for management (admin + aenq). * plus reserve one vector for each potential io queue. * the number of potential io queues is the minimum of what the device * supports and the number of vCPUs.
*/ staticint ena_enable_msix(struct ena_adapter *adapter)
{ int msix_vecs, irq_cnt;
if (test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) {
netif_err(adapter, probe, adapter->netdev, "Error, MSI-X is already enabled\n"); return -EPERM;
}
/* Reserved the max msix vectors we might need */
msix_vecs = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues);
netif_dbg(adapter, probe, adapter->netdev, "Trying to enable MSI-X, vectors %d\n", msix_vecs);
if (irq_cnt < 0) {
netif_err(adapter, probe, adapter->netdev, "Failed to enable MSI-X. irq_cnt %d\n", irq_cnt); return -ENOSPC;
}
if (irq_cnt != msix_vecs) {
netif_notice(adapter, probe, adapter->netdev, "Enable only %d MSI-X (out of %d), reduce the number of queues\n",
irq_cnt, msix_vecs);
adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC;
}
if (netif_enable_cpu_rmap(adapter->netdev, adapter->num_io_queues))
netif_warn(adapter, probe, adapter->netdev, "Failed to map IRQs to CPUs\n");
if (!test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) {
netif_err(adapter, ifup, adapter->netdev, "Failed to request I/O IRQ: MSI-X is not enabled\n"); return -EINVAL;
}
for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) {
irq = &adapter->irq_tbl[i];
rc = request_irq(irq->vector, irq->handler, flags, irq->name,
irq->data); if (rc) {
netif_err(adapter, ifup, adapter->netdev, "Failed to request I/O IRQ. index %d rc %d\n",
i, rc); goto err;
}
netif_dbg(adapter, ifup, adapter->netdev, "Set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n",
i, irq->affinity_hint_mask.bits[0], irq->vector);
/* Now that IO IRQs have been successfully allocated map them to the * corresponding IO NAPI instance. Note that the mgmnt IRQ does not * have a NAPI, so care must be taken to correctly map IRQs to NAPIs.
*/ for (i = 0; i < io_queue_count; i++) {
irq_idx = ENA_IO_IRQ_IDX(i);
irq = &adapter->irq_tbl[irq_idx];
netif_napi_set_irq(&adapter->ena_napi[i].napi, irq->vector);
}
return rc;
err: for (k = ENA_IO_IRQ_FIRST_IDX; k < i; k++) {
irq = &adapter->irq_tbl[k];
free_irq(irq->vector, irq->data);
}
/* ena_init_napi_in_range - initialize per-queue NAPI bookkeeping for
 * queues [first_index, first_index + count)
 * @adapter: driver private structure
 * @first_index: first queue index in the range
 * @count: number of queues in the range
 *
 * Wires each ena_napi instance to its queue's rings. XDP Tx queues have
 * no companion Rx ring (see the note at the rx_ring NULL-check elsewhere
 * in this file), so rx_ring is left untouched for them.
 *
 * NOTE(review): the original block read local rx_ring/tx_ring pointers
 * that were never assigned (undefined behavior) and declared an unused
 * napi_handler; the rings are now taken from the adapter's arrays and
 * the dead local removed. The NAPI registration (netif_napi_add) that
 * normally accompanies this loop appears to have been lost from this
 * block — TODO confirm against the original source.
 */
static void ena_init_napi_in_range(struct ena_adapter *adapter,
				   int first_index, int count)
{
	int i;

	for (i = first_index; i < first_index + count; i++) {
		struct ena_napi *napi = &adapter->ena_napi[i];

		/* XDP Tx queues have no accompanying Rx ring */
		if (!ENA_IS_XDP_INDEX(adapter, i))
			napi->rx_ring = &adapter->rx_ring[i];

		napi->tx_ring = &adapter->tx_ring[i];
		napi->qid = i;
	}
}
/* ena_napi_disable_in_range - disable NAPI for queues
 * [first_index, first_index + count)
 * @adapter: driver private structure
 * @first_index: first queue index in the range
 * @count: number of queues in the range
 *
 * Clears the netdev queue <-> napi mapping before disabling each napi
 * instance; the mapping step is skipped for XDP queues since
 * netif_queue_set_napi() is supported for non-XDP queues only.
 */
static void ena_napi_disable_in_range(struct ena_adapter *adapter,
				      int first_index, int count)
{
	struct napi_struct *napi;
	int i;

	for (i = first_index; i < first_index + count; i++) {
		napi = &adapter->ena_napi[i].napi;
		if (!ENA_IS_XDP_INDEX(adapter, i)) {
			/* This API is supported for non-XDP queues only */
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_TX, NULL);
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_RX, NULL);
		}
		napi_disable(napi);
	}
}
/* ena_napi_enable_in_range - enable NAPI for queues
 * [first_index, first_index + count)
 * @adapter: driver private structure
 * @first_index: first queue index in the range
 * @count: number of queues in the range
 *
 * Enables each napi instance and then registers the netdev
 * queue <-> napi mapping; the mapping step is skipped for XDP queues
 * since netif_queue_set_napi() is supported for non-XDP queues only.
 */
static void ena_napi_enable_in_range(struct ena_adapter *adapter,
				     int first_index, int count)
{
	struct napi_struct *napi;
	int i;

	for (i = first_index; i < first_index + count; i++) {
		napi = &adapter->ena_napi[i].napi;
		napi_enable(napi);
		if (!ENA_IS_XDP_INDEX(adapter, i)) {
			/* This API is supported for non-XDP queues only */
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_RX, napi);
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_TX, napi);
		}
	}
}
/* Configure the Rx forwarding */ staticint ena_rss_configure(struct ena_adapter *adapter)
{ struct ena_com_dev *ena_dev = adapter->ena_dev; int rc;
/* In case the RSS table wasn't initialized by probe */ if (!ena_dev->rss.tbl_log_size) {
rc = ena_rss_init_default(adapter); if (rc && (rc != -EOPNOTSUPP)) {
netif_err(adapter, ifup, adapter->netdev, "Failed to init RSS rc: %d\n", rc); return rc;
}
}
/* Set indirect table */
rc = ena_com_indirect_table_set(ena_dev); if (unlikely(rc && rc != -EOPNOTSUPP)) return rc;
/* Configure hash function (if supported) */
rc = ena_com_set_hash_function(ena_dev); if (unlikely(rc && (rc != -EOPNOTSUPP))) return rc;
/* ena_create_io_tx_queues_in_range - create device Tx queues for indices
 * [first_index, first_index + count)
 * @adapter: driver private structure
 * @first_index: first queue index in the range
 * @count: number of queues in the range
 *
 * Return: 0 on success, negative errno on failure. On failure every
 * queue created by this call is destroyed again before returning.
 */
int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter,
				     int first_index, int count)
{
	struct ena_com_dev *ena_dev = adapter->ena_dev;
	int rc, i;

	for (i = first_index; i < first_index + count; i++) {
		rc = ena_create_io_tx_queue(adapter, i);
		if (rc)
			goto create_err;
	}

	return 0;

create_err:
	/* Unwind only the queues this call created */
	while (i-- > first_index)
		ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i));

	/* NOTE(review): the original text was truncated here; the trailing
	 * "return rc;" and closing brace are restored.
	 */
	return rc;
}
for (i = 0; i < adapter->num_io_queues; i++) {
rc = ena_create_io_rx_queue(adapter, i); if (rc) goto create_err;
INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work);
create_err: while (i--) {
ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]);
cancel_work_sync(&adapter->ena_napi[i].dim.work);
ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i));
}
return rc;
}
/* set_io_rings_size - set the ring sizes on every IO queue pair
 * @adapter: driver private structure
 * @new_tx_size: Tx ring size (number of descriptors)
 * @new_rx_size: Rx ring size (number of descriptors)
 *
 * Only updates the software ring-size fields; used by the queue-size
 * backoff logic before (re)creating the queues.
 */
static void set_io_rings_size(struct ena_adapter *adapter,
			      int new_tx_size, int new_rx_size)
{
	int i;

	for (i = 0; i < adapter->num_io_queues; i++) {
		adapter->tx_ring[i].ring_size = new_tx_size;
		adapter->rx_ring[i].ring_size = new_rx_size;
	}
}
/* This function allows queue allocation to backoff when the system is * low on memory. If there is not enough memory to allocate io queues * the driver will try to allocate smaller queues. * * The backoff algorithm is as follows: * 1. Try to allocate TX and RX and if successful. * 1.1. return success * * 2. Divide by 2 the size of the larger of RX and TX queues (or both if their size is the same). * * 3. If TX or RX is smaller than 256 * 3.1. return failure. * 4. else * 4.1. go back to 1.
*/ staticint create_queues_with_size_backoff(struct ena_adapter *adapter)
{ int rc, cur_rx_ring_size, cur_tx_ring_size; int new_rx_ring_size, new_tx_ring_size;
/* current queue sizes might be set to smaller than the requested * ones due to past queue allocation failures.
*/
set_io_rings_size(adapter, adapter->requested_tx_ring_size,
adapter->requested_rx_ring_size);
while (1) { if (ena_xdp_present(adapter)) {
rc = ena_setup_and_create_all_xdp_queues(adapter);
if (rc) goto err_setup_tx;
}
rc = ena_setup_tx_resources_in_range(adapter,
0,
adapter->num_io_queues); if (rc) goto err_setup_tx;
rc = ena_create_io_tx_queues_in_range(adapter,
0,
adapter->num_io_queues); if (rc) goto err_create_tx_queues;
rc = ena_setup_all_rx_resources(adapter); if (rc) goto err_setup_rx;
rc = ena_create_all_io_rx_queues(adapter); if (rc) goto err_create_rx_queues;
/* Decrease the size of the larger queue, or * decrease both if they are the same size.
*/ if (cur_rx_ring_size <= cur_tx_ring_size)
new_tx_ring_size = cur_tx_ring_size / 2; if (cur_rx_ring_size >= cur_tx_ring_size)
new_rx_ring_size = cur_rx_ring_size / 2;
if (new_tx_ring_size < ENA_MIN_RING_SIZE ||
new_rx_ring_size < ENA_MIN_RING_SIZE) {
netif_err(adapter, ifup, adapter->netdev, "Queue creation failed with the smallest possible queue size of %d for both queues. Not retrying with smaller queues\n",
ENA_MIN_RING_SIZE); return rc;
}
/* napi poll functions should be initialized before running * request_irq(), to handle a rare condition where there is a pending * interrupt, causing the ISR to fire immediately while the poll * function wasn't set yet, causing a null dereference
*/
ena_init_napi_in_range(adapter, 0, io_queue_count);
/* Enabling DIM needs to happen before enabling IRQs since DIM * is run from napi routine
*/ if (ena_com_interrupt_moderation_supported(adapter->ena_dev))
ena_com_enable_adaptive_moderation(adapter->ena_dev);
rc = ena_request_io_irq(adapter); if (rc) goto err_req_irq;
rc = create_queues_with_size_backoff(adapter); if (rc) goto err_create_queues_with_backoff;
rc = ena_up_complete(adapter); if (rc) goto err_up;
if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags))
netif_carrier_on(adapter->netdev);
/* Enable completion queues interrupt */ for (i = 0; i < adapter->num_io_queues; i++)
ena_unmask_interrupt(&adapter->tx_ring[i],
&adapter->rx_ring[i]);
/* schedule napi in case we had pending packets * from the last time we disable napi
*/ for (i = 0; i < io_queue_count; i++)
napi_schedule(&adapter->ena_napi[i].napi);
/* ena_open - Called when a network interface is made active
 * @netdev: network interface device structure
 *
 * Returns 0 on success, negative value on failure
 *
 * The open entry point is called when a network interface is made
 * active by the system (IFF_UP). At this point all resources needed
 * for transmit and receive operations are allocated, the interrupt
 * handler is registered with the OS, the watchdog timer is started,
 * and the stack is notified that the interface is ready.
 */
static int ena_open(struct net_device *netdev)
{
	struct ena_adapter *adapter = netdev_priv(netdev);
	int rc;

	/* Notify the stack of the actual queue counts. */
	rc = netif_set_real_num_tx_queues(netdev, adapter->num_io_queues);
	if (rc) {
		netif_err(adapter, ifup, netdev, "Can't set num tx queues\n");
		return rc;
	}

	rc = netif_set_real_num_rx_queues(netdev, adapter->num_io_queues);
	if (rc) {
		netif_err(adapter, ifup, netdev, "Can't set num rx queues\n");
		return rc;
	}

	/* The original "if (rc) return rc; return rc;" tail was redundant;
	 * simply propagate ena_up()'s result.
	 */
	return ena_up(adapter);
}
/* ena_close - Disables a network interface * @netdev: network interface device structure * * Returns 0, this is not allowed to fail * * The close entry point is called when an interface is de-activated * by the OS. The hardware is still under the drivers control, but * needs to be disabled. A global MAC reset is issued to stop the * hardware, and all transmit and receive resources are freed.
*/ staticint ena_close(struct net_device *netdev)
{ struct ena_adapter *adapter = netdev_priv(netdev);
/* a check that the configuration is valid is done by caller */ if (large_llq_changed) {
adapter->large_llq_header_enabled = !adapter->large_llq_header_enabled;
/* We need to destroy the rss table so that the indirection * table will be reinitialized by ena_up()
*/
ena_com_rss_destroy(ena_dev);
ena_init_io_rings(adapter,
0,
adapter->xdp_num_queues +
adapter->num_io_queues); return dev_was_up ? ena_open(adapter->netdev) : 0;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.