netdev_err(dev, "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. napi scheduled: %d\n",
txqueue,
threshold,
time_since_last_napi,
napi_scheduled);
if (threshold < time_since_last_napi && napi_scheduled) {
netdev_err(dev, "napi handler hasn't been called for a long time but is scheduled\n");
reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION;
}
schedule_reset: /* Change the state of the device to trigger reset * Check that we are not in the middle or a trigger already
*/ if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) return;
/* update_rx_ring_mtu - store an MTU value in every IO Rx ring
 * @adapter: adapter whose rx_ring[] entries are updated
 * @mtu: value written to the mtu field of each ring
 *
 * Only the first adapter->num_io_queues rings are touched.
 */
static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu)
{
	int i;

	for (i = 0; i < adapter->num_io_queues; i++)
		adapter->rx_ring[i].mtu = mtu;
}
staticint ena_change_mtu(struct net_device *dev, int new_mtu)
{ struct ena_adapter *adapter = netdev_priv(dev); int ret;
ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu); if (!ret) {
netif_dbg(adapter, drv, dev, "Set MTU to %d\n", new_mtu);
update_rx_ring_mtu(adapter, new_mtu);
WRITE_ONCE(dev->mtu, new_mtu);
} else {
netif_err(adapter, drv, dev, "Failed to set MTU to %d\n",
new_mtu);
}
if (unlikely(ena_com_is_doorbell_needed(ring->ena_com_io_sq,
ena_tx_ctx))) {
netif_dbg(adapter, tx_queued, adapter->netdev, "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n",
ring->qid);
ena_ring_tx_doorbell(ring);
}
/* prepare the packet's descriptors to dma engine */
rc = ena_com_prepare_tx(ring->ena_com_io_sq, ena_tx_ctx,
&nb_hw_desc);
/* In case there isn't enough space in the queue for the packet, * we simply drop it. All other failure reasons of * ena_com_prepare_tx() are fatal and therefore require a device reset.
*/ if (unlikely(rc)) {
netif_err(adapter, tx_queued, adapter->netdev, "Failed to prepare tx bufs\n");
ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, &ring->syncp); if (rc != -ENOMEM)
ena_reset_device(adapter, ENA_REGS_RESET_DRIVER_INVALID_STATE); return rc;
}
if (rx_ring->rx_buffer_info) {
netif_err(adapter, ifup, adapter->netdev, "rx_buffer_info is not NULL"); return -EEXIST;
}
/* alloc extra element so in rx path * we can always prefetch rx_info + 1
*/
size = sizeof(struct ena_rx_buffer) * (rx_ring->ring_size + 1);
node = cpu_to_node(ena_irq->cpu);
rx_ring->rx_buffer_info = vzalloc_node(size, node); if (!rx_ring->rx_buffer_info) {
rx_ring->rx_buffer_info = vzalloc(size); if (!rx_ring->rx_buffer_info) return -ENOMEM;
}
/* This would allocate the page on the same NUMA node the executing code * is running on.
*/
page = dev_alloc_page(); if (!page) {
ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, &rx_ring->syncp); return ERR_PTR(-ENOSPC);
}
/* To enable NIC-side port-mirroring, AKA SPAN port, * we make the buffer readable from the nic as well
*/
*dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE,
DMA_BIDIRECTIONAL); if (unlikely(dma_mapping_error(rx_ring->dev, *dma))) {
ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1,
&rx_ring->syncp);
__free_page(page); return ERR_PTR(-EIO);
}
/* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer * for data + headroom + tailroom.
*/ if (SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) {
page_ref_inc(rx_info->page);
rx_info->page_offset += buf_len;
ena_buf->paddr += buf_len;
ena_buf->len -= buf_len; returntrue;
}
/* If XDP isn't loaded try to reuse part of the RX buffer */
reuse_rx_buf_page = !is_xdp_loaded &&
ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset);
if (!reuse_rx_buf_page)
ena_unmap_rx_buff_attrs(rx_ring, rx_info, DMA_ATTR_SKIP_CPU_SYNC);
skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); if (unlikely(!skb)) return NULL;
/* Populate skb's linear part */
skb_reserve(skb, buf_offset);
skb_put(skb, len);
skb->protocol = eth_type_trans(skb, rx_ring->netdev);
do {
netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "RX skb updated. len %d. data_len %d\n",
skb->len, skb->data_len);
/* ena_rx_checksum - indicate in skb if hw indicated a good cksum * @adapter: structure containing adapter specific data * @ena_rx_ctx: received packet context/metadata * @skb: skb currently being received and modified
*/ staticvoid ena_rx_checksum(struct ena_ring *rx_ring, struct ena_com_rx_ctx *ena_rx_ctx, struct sk_buff *skb)
{ /* Rx csum disabled */ if (unlikely(!(rx_ring->netdev->features & NETIF_F_RXCSUM))) {
skb->ip_summed = CHECKSUM_NONE; return;
}
/* For fragmented packets the checksum isn't valid */ if (ena_rx_ctx->frag) {
skb->ip_summed = CHECKSUM_NONE; return;
}
/* if IP and error */ if (unlikely((ena_rx_ctx->l3_proto == ENA_ETH_IO_L3_PROTO_IPV4) &&
(ena_rx_ctx->l3_csum_err))) { /* ipv4 checksum error */
skb->ip_summed = CHECKSUM_NONE;
ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1,
&rx_ring->syncp);
netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX IPv4 header checksum error\n"); return;
}
/* First descriptor might have an offset set by the device */
rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id];
pkt_offset = ena_rx_ctx.pkt_offset;
rx_info->buf_offset += pkt_offset;
netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n",
rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto,
ena_rx_ctx.l4_proto, ena_rx_ctx.hash);
/* Rx ring can be NULL for XDP tx queues which don't have an * accompanying rx_ring pair.
*/ if (rx_ring)
rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ?
rx_ring->smoothed_interval :
ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev);
/* It is a shared MSI-X. * Tx and Rx CQ have pointer to it. * So we use one of them to reach the intr reg * The Tx ring is used because the rx_ring is NULL for XDP queues
*/
ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg);
}
/* ena_update_ring_numa_node - track the CPU/NUMA node a ring pair runs on
 * @tx_ring: Tx ring of the pair (always dereferenced)
 * @rx_ring: Rx ring of the pair, may be NULL
 *
 * Records the current CPU in both rings and, when the NUMA node derived from
 * that CPU differs from the one cached in @tx_ring, forwards the new node to
 * the completion queues through ena_com_update_numa_node().
 */
void ena_update_ring_numa_node(struct ena_ring *tx_ring,
			       struct ena_ring *rx_ring)
{
	int cur_cpu = get_cpu();
	int node;

	/* Both rings run on the same cpu, so checking the Tx ring is enough */
	if (likely(tx_ring->cpu == cur_cpu)) {
		put_cpu();
		return;
	}

	tx_ring->cpu = cur_cpu;
	if (rx_ring)
		rx_ring->cpu = cur_cpu;

	node = cpu_to_node(cur_cpu);

	/* The cpu changed but stayed on the same node: nothing to report */
	if (likely(tx_ring->numa_node == node)) {
		put_cpu();
		return;
	}

	/* The get_cpu() section ends before any call into ena_com */
	put_cpu();

	if (node == NUMA_NO_NODE)
		return;

	ena_com_update_numa_node(tx_ring->ena_com_io_cq, node);
	tx_ring->numa_node = node;

	if (!rx_ring)
		return;

	rx_ring->numa_node = node;
	ena_com_update_numa_node(rx_ring->ena_com_io_cq, node);
}
staticint ena_io_poll(struct napi_struct *napi, int budget)
{ struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); struct ena_ring *tx_ring, *rx_ring; int tx_work_done; int rx_work_done = 0; int tx_budget; int napi_comp_call = 0; int ret;
tx_work_done = ena_clean_tx_irq(tx_ring, tx_budget); /* On netpoll the budget is zero and the handler should only clean the * tx completions.
*/ if (likely(budget))
rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget);
/* If the device is about to reset or down, avoid unmask * the interrupt and return 0 so NAPI won't reschedule
*/ if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) ||
test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags))) {
napi_complete_done(napi, 0);
ret = 0;
/* Update numa and unmask the interrupt only when schedule * from the interrupt context (vs from sk_busy_loop)
*/ if (napi_complete_done(napi, rx_work_done) &&
READ_ONCE(ena_napi->interrupts_masked)) {
smp_rmb(); /* make sure interrupts_masked is read */
WRITE_ONCE(ena_napi->interrupts_masked, false); /* We apply adaptive moderation on Rx path only. * Tx uses static interrupt moderation.
*/ if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev))
ena_adjust_adaptive_rx_intr_moderation(ena_napi);
/* Don't call the aenq handler before probe is done */ if (likely(test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)))
ena_com_aenq_intr_handler(adapter->ena_dev, data);
return IRQ_HANDLED;
}
/* ena_intr_msix_io - MSI-X Interrupt Handler for Tx/Rx
 * @irq: interrupt number (not used by the handler)
 * @data: pointer to the struct ena_napi registered for this vector
 *
 * Marks the napi context as having seen an interrupt, flags its interrupts
 * as masked and schedules the napi instance. Always returns IRQ_HANDLED.
 */
static irqreturn_t ena_intr_msix_io(int irq, void *data)
{
	struct ena_napi *napi_ctx = data;

	/* Used to check HW health */
	WRITE_ONCE(napi_ctx->first_interrupt, true);

	WRITE_ONCE(napi_ctx->interrupts_masked, true);
	/* write interrupts_masked before calling napi */
	smp_wmb();

	napi_schedule_irqoff(&napi_ctx->napi);

	return IRQ_HANDLED;
}
/* Reserve a single MSI-X vector for management (admin + aenq). * plus reserve one vector for each potential io queue. * the number of potential io queues is the minimum of what the device * supports and the number of vCPUs.
*/ staticint ena_enable_msix(struct ena_adapter *adapter)
{ int msix_vecs, irq_cnt;
if (test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) {
netif_err(adapter, probe, adapter->netdev, "Error, MSI-X is already enabled\n"); return -EPERM;
}
/* Reserved the max msix vectors we might need */
msix_vecs = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues);
netif_dbg(adapter, probe, adapter->netdev, "Trying to enable MSI-X, vectors %d\n", msix_vecs);
if (irq_cnt < 0) {
netif_err(adapter, probe, adapter->netdev, "Failed to enable MSI-X. irq_cnt %d\n", irq_cnt); return -ENOSPC;
}
if (irq_cnt != msix_vecs) {
netif_notice(adapter, probe, adapter->netdev, "Enable only %d MSI-X (out of %d), reduce the number of queues\n",
irq_cnt, msix_vecs);
adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC;
}
if (netif_enable_cpu_rmap(adapter->netdev, adapter->num_io_queues))
netif_warn(adapter, probe, adapter->netdev, "Failed to map IRQs to CPUs\n");
if (!test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) {
netif_err(adapter, ifup, adapter->netdev, "Failed to request I/O IRQ: MSI-X is not enabled\n"); return -EINVAL;
}
for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) {
irq = &adapter->irq_tbl[i];
rc = request_irq(irq->vector, irq->handler, flags, irq->name,
irq->data); if (rc) {
netif_err(adapter, ifup, adapter->netdev, "Failed to request I/O IRQ. index %d rc %d\n",
i, rc); goto err;
}
netif_dbg(adapter, ifup, adapter->netdev, "Set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n",
i, irq->affinity_hint_mask.bits[0], irq->vector);
/* Now that IO IRQs have been successfully allocated map them to the * corresponding IO NAPI instance. Note that the mgmnt IRQ does not * have a NAPI, so care must be taken to correctly map IRQs to NAPIs.
*/ for (i = 0; i < io_queue_count; i++) {
irq_idx = ENA_IO_IRQ_IDX(i);
irq = &adapter->irq_tbl[irq_idx];
netif_napi_set_irq(&adapter->ena_napi[i].napi, irq->vector);
}
return rc;
err: for (k = ENA_IO_IRQ_FIRST_IDX; k < i; k++) {
irq = &adapter->irq_tbl[k];
free_irq(irq->vector, irq->data);
}
/* ena_init_napi_in_range - bind rings to their napi contexts
 * @adapter: adapter owning the ena_napi[], tx_ring[] and rx_ring[] arrays
 * @first_index: first queue index to initialize
 * @count: number of consecutive queue indices to initialize
 *
 * For every index in [first_index, first_index + count) the napi context is
 * pointed at the Tx ring with the same index and its qid is set. The Rx ring
 * is attached only for indices that are not XDP queues.
 *
 * NOTE(review): no poll handler is registered in the visible body (the
 * previously declared, never-used napi_handler local was dropped) - confirm
 * that napi registration happens elsewhere.
 */
static void ena_init_napi_in_range(struct ena_adapter *adapter,
				   int first_index, int count)
{
	int i;

	for (i = first_index; i < first_index + count; i++) {
		struct ena_napi *napi = &adapter->ena_napi[i];

		/* Ring pointers are taken from the adapter arrays rather than
		 * from uninitialized locals.
		 */
		if (!ENA_IS_XDP_INDEX(adapter, i))
			napi->rx_ring = &adapter->rx_ring[i];

		napi->tx_ring = &adapter->tx_ring[i];
		napi->qid = i;
	}
}
/* ena_napi_disable_in_range - disable napi for a range of queues
 * @adapter: adapter owning the ena_napi[] array
 * @first_index: first queue index to disable
 * @count: number of consecutive queue indices to disable
 *
 * For non-XDP queues the Tx and Rx queue-to-napi associations are cleared
 * before the napi instance is disabled.
 */
static void ena_napi_disable_in_range(struct ena_adapter *adapter,
				      int first_index, int count)
{
	struct napi_struct *napi;
	int i;

	for (i = first_index; i < first_index + count; i++) {
		napi = &adapter->ena_napi[i].napi;

		/* This API is supported for non-XDP queues only */
		if (!ENA_IS_XDP_INDEX(adapter, i)) {
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_TX, NULL);
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_RX, NULL);
		}

		napi_disable(napi);
	}
}
/* ena_napi_enable_in_range - enable napi for a range of queues
 * @adapter: adapter owning the ena_napi[] array
 * @first_index: first queue index to enable
 * @count: number of consecutive queue indices to enable
 *
 * The napi instance is enabled first; for non-XDP queues it is then
 * associated with the Rx and Tx queue of the same index.
 */
static void ena_napi_enable_in_range(struct ena_adapter *adapter,
				     int first_index, int count)
{
	struct napi_struct *napi;
	int i;

	for (i = first_index; i < first_index + count; i++) {
		napi = &adapter->ena_napi[i].napi;

		napi_enable(napi);

		/* This API is supported for non-XDP queues only */
		if (!ENA_IS_XDP_INDEX(adapter, i)) {
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_RX, napi);
			netif_queue_set_napi(adapter->netdev, i,
					     NETDEV_QUEUE_TYPE_TX, napi);
		}
	}
}
/* Configure the Rx forwarding */ staticint ena_rss_configure(struct ena_adapter *adapter)
{ struct ena_com_dev *ena_dev = adapter->ena_dev; int rc;
/* In case the RSS table wasn't initialized by probe */ if (!ena_dev->rss.tbl_log_size) {
rc = ena_rss_init_default(adapter); if (rc && (rc != -EOPNOTSUPP)) {
netif_err(adapter, ifup, adapter->netdev, "Failed to init RSS rc: %d\n", rc); return rc;
}
}
/* Set indirect table */
rc = ena_com_indirect_table_set(ena_dev); if (unlikely(rc && rc != -EOPNOTSUPP)) return rc;
/* Configure hash function (if supported) */
rc = ena_com_set_hash_function(ena_dev); if (unlikely(rc && (rc != -EOPNOTSUPP))) return rc;
int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, int first_index, int count)
{ struct ena_com_dev *ena_dev = adapter->ena_dev; int rc, i;
for (i = first_index; i < first_index + count; i++) {
rc = ena_create_io_tx_queue(adapter, i); if (rc) goto create_err;
}
return 0;
create_err: while (i-- > first_index)
ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i));
for (i = 0; i < adapter->num_io_queues; i++) {
rc = ena_create_io_rx_queue(adapter, i); if (rc) goto create_err;
INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work);
create_err: while (i--) {
ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]);
cancel_work_sync(&adapter->ena_napi[i].dim.work);
ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i));
}
return rc;
}
/* set_io_rings_size - set the ring size of every IO queue pair
 * @adapter: adapter whose tx_ring[] and rx_ring[] entries are updated
 * @new_tx_size: value stored in each Tx ring's ring_size
 * @new_rx_size: value stored in each Rx ring's ring_size
 *
 * Only the first adapter->num_io_queues ring pairs are updated.
 */
static void set_io_rings_size(struct ena_adapter *adapter,
			      int new_tx_size, int new_rx_size)
{
	int i;

	for (i = 0; i < adapter->num_io_queues; i++) {
		adapter->tx_ring[i].ring_size = new_tx_size;
		adapter->rx_ring[i].ring_size = new_rx_size;
	}
}
/* This function allows queue allocation to backoff when the system is * low on memory. If there is not enough memory to allocate io queues * the driver will try to allocate smaller queues. * * The backoff algorithm is as follows: * 1. Try to allocate TX and RX and if successful. * 1.1. return success * * 2. Divide by 2 the size of the larger of RX and TX queues (or both if their size is the same). * * 3. If TX or RX is smaller than 256 * 3.1. return failure. * 4. else * 4.1. go back to 1.
*/ staticint create_queues_with_size_backoff(struct ena_adapter *adapter)
{ int rc, cur_rx_ring_size, cur_tx_ring_size; int new_rx_ring_size, new_tx_ring_size;
/* current queue sizes might be set to smaller than the requested * ones due to past queue allocation failures.
*/
set_io_rings_size(adapter, adapter->requested_tx_ring_size,
adapter->requested_rx_ring_size);
while (1) { if (ena_xdp_present(adapter)) {
rc = ena_setup_and_create_all_xdp_queues(adapter);
if (rc) goto err_setup_tx;
}
rc = ena_setup_tx_resources_in_range(adapter,
0,
adapter->num_io_queues); if (rc) goto err_setup_tx;
rc = ena_create_io_tx_queues_in_range(adapter,
0,
adapter->num_io_queues); if (rc) goto err_create_tx_queues;
rc = ena_setup_all_rx_resources(adapter); if (rc) goto err_setup_rx;
rc = ena_create_all_io_rx_queues(adapter); if (rc) goto err_create_rx_queues;
/* Decrease the size of the larger queue, or * decrease both if they are the same size.
*/ if (cur_rx_ring_size <= cur_tx_ring_size)
new_tx_ring_size = cur_tx_ring_size / 2; if (cur_rx_ring_size >= cur_tx_ring_size)
new_rx_ring_size = cur_rx_ring_size / 2;
if (new_tx_ring_size < ENA_MIN_RING_SIZE ||
new_rx_ring_size < ENA_MIN_RING_SIZE) {
netif_err(adapter, ifup, adapter->netdev, "Queue creation failed with the smallest possible queue size of %d for both queues. Not retrying with smaller queues\n",
ENA_MIN_RING_SIZE); return rc;
}
/* napi poll functions should be initialized before running * request_irq(), to handle a rare condition where there is a pending * interrupt, causing the ISR to fire immediately while the poll * function wasn't set yet, causing a null dereference
*/
ena_init_napi_in_range(adapter, 0, io_queue_count);
/* Enabling DIM needs to happen before enabling IRQs since DIM * is run from napi routine
*/ if (ena_com_interrupt_moderation_supported(adapter->ena_dev))
ena_com_enable_adaptive_moderation(adapter->ena_dev);
rc = ena_request_io_irq(adapter); if (rc) goto err_req_irq;
rc = create_queues_with_size_backoff(adapter); if (rc) goto err_create_queues_with_backoff;
rc = ena_up_complete(adapter); if (rc) goto err_up;
if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags))
netif_carrier_on(adapter->netdev);
/* Enable completion queues interrupt */ for (i = 0; i < adapter->num_io_queues; i++)
ena_unmask_interrupt(&adapter->tx_ring[i],
&adapter->rx_ring[i]);
/* schedule napi in case we had pending packets * from the last time we disable napi
*/ for (i = 0; i < io_queue_count; i++)
napi_schedule(&adapter->ena_napi[i].napi);
/* ena_open - Called when a network interface is made active
 * @netdev: network interface device structure
 *
 * Returns 0 on success, negative value on failure
 *
 * The open entry point is called when a network interface is made
 * active by the system (IFF_UP). At this point all resources needed
 * for transmit and receive operations are allocated, the interrupt
 * handler is registered with the OS, the watchdog timer is started,
 * and the stack is notified that the interface is ready.
 */
static int ena_open(struct net_device *netdev)
{
	struct ena_adapter *adapter = netdev_priv(netdev);
	int rc;

	/* Notify the stack of the actual queue counts. */
	rc = netif_set_real_num_tx_queues(netdev, adapter->num_io_queues);
	if (rc) {
		netif_err(adapter, ifup, netdev, "Can't set num tx queues\n");
		return rc;
	}

	rc = netif_set_real_num_rx_queues(netdev, adapter->num_io_queues);
	if (rc) {
		netif_err(adapter, ifup, netdev, "Can't set num rx queues\n");
		return rc;
	}

	/* The result of ena_up() is the result of the open, success or not */
	return ena_up(adapter);
}
/* ena_close - Disables a network interface * @netdev: network interface device structure * * Returns 0, this is not allowed to fail * * The close entry point is called when an interface is de-activated * by the OS. The hardware is still under the drivers control, but * needs to be disabled. A global MAC reset is issued to stop the * hardware, and all transmit and receive resources are freed.
*/ staticint ena_close(struct net_device *netdev)
{ struct ena_adapter *adapter = netdev_priv(netdev);
/* a check that the configuration is valid is done by caller */ if (large_llq_changed) {
adapter->large_llq_header_enabled = !adapter->large_llq_header_enabled;
/* We need to destroy the rss table so that the indirection * table will be reinitialized by ena_up()
*/
ena_com_rss_destroy(ena_dev);
ena_init_io_rings(adapter,
0,
adapter->xdp_num_queues +
adapter->num_io_queues); return dev_was_up ? ena_open(adapter->netdev) : 0;
}
if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { /* When the device is LLQ mode, the driver will copy * the header into the device memory space. * the ena_com layer assume the header is in a linear * memory space. * This assumption might be wrong since part of the header * can be in the fragmented buffers. * Use skb_header_pointer to make sure the header is in a * linear memory space.
*/
netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb); /* Determine which tx ring we will be placed on */
qid = skb_get_queue_mapping(skb);
tx_ring = &adapter->tx_ring[qid];
txq = netdev_get_tx_queue(dev, qid);
rc = ena_check_and_linearize_skb(tx_ring, skb); if (unlikely(rc)) goto error_drop_packet;
/* stop the queue when no more space available, the packet can have up * to sgl_size + 2. one for the meta descriptor and one for header * (if the header is larger than tx_max_header_size).
*/ if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq,
tx_ring->sgl_size + 2))) {
netif_dbg(adapter, tx_queued, dev, "%s stop queue %d\n",
__func__, qid);
/* There is a rare condition where this function decides to * stop the queue but meanwhile clean_tx_irq updates * next_to_completion and terminates. * The queue will remain stopped forever. * To solve this issue add a mb() to make sure that * netif_tx_stop_queue() write is visible before checking if * there is additional space in the queue.
*/
smp_mb();
if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq,
ENA_TX_WAKEUP_THRESH)) {
netif_tx_wake_queue(txq);
ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1,
&tx_ring->syncp);
}
}
skb_tx_timestamp(skb);
if (netif_xmit_stopped(txq) || !netdev_xmit_more()) /* trigger the dma engine. ena_ring_tx_doorbell() * calls a memory barrier inside it.
*/
ena_ring_tx_doorbell(tx_ring);
/* Allocate only the host info */
rc = ena_com_allocate_host_info(ena_dev); if (rc) {
dev_err(dev, "Cannot allocate host info\n"); return;
}
host_info = ena_dev->host_attr.host_info;
host_info->bdf = pci_dev_id(pdev);
host_info->os_type = ENA_ADMIN_OS_LINUX;
host_info->kernel_ver = LINUX_VERSION_CODE;
ret = strscpy(host_info->kernel_ver_str, utsname()->version, sizeof(host_info->kernel_ver_str)); if (ret < 0)
dev_dbg(dev, "kernel version string will be truncated, status = %zd\n", ret);
host_info->os_dist = 0;
ret = strscpy(host_info->os_dist_str, utsname()->release, sizeof(host_info->os_dist_str)); if (ret < 0)
dev_dbg(dev, "OS distribution string will be truncated, status = %zd\n", ret);
/* If this function is called after driver load, the ring sizes have already * been configured. Take it into account when recalculating ring size.
*/ if (adapter->tx_ring->ring_size)
tx_queue_size = adapter->tx_ring->ring_size;
if (adapter->rx_ring->ring_size)
rx_queue_size = adapter->rx_ring->ring_size;
if (max_tx_queue_size < ENA_MIN_RING_SIZE) {
netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n",
max_tx_queue_size, ENA_MIN_RING_SIZE); return -EINVAL;
}
if (max_rx_queue_size < ENA_MIN_RING_SIZE) {
netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n",
max_rx_queue_size, ENA_MIN_RING_SIZE); return -EINVAL;
}
/* When forcing large headers, we multiply the entry size by 2, and therefore divide * the queue size by 2, leaving the amount of memory used by the queues unchanged.
*/ if (adapter->large_llq_header_enabled) { if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) &&
ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) {
max_tx_queue_size /= 2;
dev_info(&adapter->pdev->dev, "Forcing large headers and decreasing maximum TX queue size to %d\n",
max_tx_queue_size);
} else {
dev_err(&adapter->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n");
rc = ether_addr_equal(get_feat_ctx->dev_attr.mac_addr,
adapter->mac_addr); if (!rc) {
netif_err(adapter, drv, netdev, "Error, mac address are different\n"); return -EINVAL;
}
if (get_feat_ctx->dev_attr.max_mtu < netdev->mtu) {
netif_err(adapter, drv, netdev, "Error, device max mtu is smaller than netdev MTU\n"); return -EINVAL;
}
llq_feature_mask = 1 << ENA_ADMIN_LLQ; if (!(ena_dev->supported_features & llq_feature_mask)) {
dev_warn(&pdev->dev, "LLQ is not supported Fallback to host mode policy.\n");
ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; return 0;
}
if (!ena_dev->mem_bar) {
netdev_err(ena_dev->net_device, "LLQ is advertised as supported but device doesn't expose mem bar\n");
ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; return 0;
}
rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); if (unlikely(rc)) {
dev_err(&pdev->dev, "Failed to configure the device mode. Fallback to host mode policy.\n");
ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST;
}
rc = ena_com_mmio_reg_read_request_init(ena_dev); if (rc) {
dev_err(dev, "Failed to init mmio read less\n"); return rc;
}
/* The PCIe configuration space revision id indicate if mmio reg * read is disabled
*/
readless_supported = !(pdev->revision & ENA_MMIO_DISABLE_REG_READ);
ena_com_set_mmio_read_mode(ena_dev, readless_supported);
rc = ena_com_dev_reset(ena_dev, ENA_REGS_RESET_NORMAL); if (rc) {
dev_err(dev, "Can not reset device\n"); goto err_mmio_read_less;
}
rc = ena_com_validate_version(ena_dev); if (rc) {
dev_err(dev, "Device version is too low\n"); goto err_mmio_read_less;
}
/* ENA admin level init */
rc = ena_com_admin_init(ena_dev, &aenq_handlers); if (rc) {
dev_err(dev, "Can not initialize ena admin queue with device\n"); goto err_mmio_read_less;
}
/* To enable the msix interrupts the driver needs to know the number * of queues. So the driver uses polling mode to retrieve this * information
*/
ena_com_set_admin_polling_mode(ena_dev, true);
ena_config_host_info(ena_dev, pdev);
/* Get Device Attributes*/
rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx); if (rc) {
dev_err(dev, "Cannot get attribute for ena device rc=%d\n", rc); goto err_admin_init;
}
/* Try to turn all the available aenq groups */
aenq_groups = BIT(ENA_ADMIN_LINK_CHANGE) |
BIT(ENA_ADMIN_FATAL_ERROR) |
BIT(ENA_ADMIN_WARNING) |
BIT(ENA_ADMIN_NOTIFICATION) |
BIT(ENA_ADMIN_KEEP_ALIVE);
/* Stop the device from sending AENQ events (in case reset flag is set * and device is up, ena_down() already reset the device.
*/ if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up))
rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason);
ena_free_mgmnt_irq(adapter);
ena_disable_msix(adapter);
ena_com_abort_admin_commands(ena_dev);
ena_com_wait_for_abort_completion(ena_dev);
ena_com_admin_destroy(ena_dev);
ena_phc_destroy(adapter);
ena_com_mmio_reg_read_request_destroy(ena_dev);
/* return reset reason to default value */
adapter->reset_reason = ENA_REGS_RESET_NORMAL;
count = adapter->xdp_num_queues + adapter->num_io_queues; for (i = 0 ; i < count; i++) {
txr = &adapter->tx_ring[i];
txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type;
txr->tx_max_header_size = ena_dev->tx_max_header_size;
}
rc = ena_device_validate_params(adapter, &get_feat_ctx); if (rc) {
dev_err(&pdev->dev, "Validation of device parameters failed\n"); goto err_device_destroy;
}
rc = ena_enable_msix_and_set_admin_interrupts(adapter); if (rc) {
dev_err(&pdev->dev, "Enable MSI-X failed\n"); goto err_device_destroy;
} /* If the interface was up before the reset bring it up */ if (adapter->dev_up_before_reset) {
rc = ena_up(adapter); if (rc) {
dev_err(&pdev->dev, "Failed to create I/O queues\n"); goto err_disable_msix;
}
}
if (unlikely(!READ_ONCE(ena_napi->first_interrupt) && is_tx_comp_time_expired)) { /* If after graceful period interrupt is still not * received, we schedule a reset
*/
netif_err(adapter, tx_err, adapter->netdev, "Potential MSIX issue on Tx side Queue = %d. Reset the device\n",
tx_ring->qid);
ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); return -EIO;
}
if (missing_tx_comp_to < time_since_last_napi && napi_scheduled) { /* We suspect napi isn't called because the * bottom half is not run. Require a bigger * timeout for these cases
*/ if (!time_is_before_jiffies(last_jiffies +
2 * adapter->missing_tx_completion_to)) continue;
netif_notice(adapter, tx_err, adapter->netdev, "TX hasn't completed, qid %d, index %d. %u usecs from last napi execution, napi scheduled: %d\n",
tx_ring->qid, i, time_since_last_napi, napi_scheduled);
tx_buf->print_once = 1;
}
}
if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) {
netif_err(adapter, tx_err, adapter->netdev, "Lost TX completions are above the threshold (%d > %d). Completion transmission timeout: %u.\n",
missed_tx,
adapter->missing_tx_completion_threshold,
missing_tx_comp_to);
netif_err(adapter, tx_err, adapter->netdev, "Resetting the device\n");
/* trigger napi schedule after 2 consecutive detections */ #define EMPTY_RX_REFILL 2 /* For the rare case where the device runs out of Rx descriptors and the * napi handler failed to refill new Rx descriptors (due to a lack of memory * for example). * This case will lead to a deadlock: * The device won't send interrupts since all the new Rx packets will be dropped * The napi handler won't allocate new Rx descriptors so the device will be * able to send new packets. * * This scenario can happen when the kernel's vm.min_free_kbytes is too small. * It is recommended to have at least 512MB, with a minimum of 128MB for * constrained environment). * * When such a situation is detected - Reschedule napi
*/ staticvoid check_for_empty_rx_ring(struct ena_adapter *adapter)
{ struct ena_ring *rx_ring; int i, refill_required;
if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) return;
if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) return;
for (i = 0; i < adapter->num_io_queues; i++) {
rx_ring = &adapter->rx_ring[i];
if (hints->admin_completion_tx_timeout)
adapter->ena_dev->admin_queue.completion_timeout =
hints->admin_completion_tx_timeout * 1000;
if (hints->mmio_read_timeout) /* convert to usec */
adapter->ena_dev->mmio_read.reg_read_to =
hints->mmio_read_timeout * 1000;
if (hints->missed_tx_completion_count_threshold_to_reset)
adapter->missing_tx_completion_threshold =
hints->missed_tx_completion_count_threshold_to_reset;
if (hints->missing_tx_completion_timeout) { if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT)
adapter->missing_tx_completion_to = ENA_HW_HINTS_NO_TIMEOUT; else
adapter->missing_tx_completion_to =
msecs_to_jiffies(hints->missing_tx_completion_timeout);
}
if (hints->netdev_wd_timeout)
netdev->watchdog_timeo = msecs_to_jiffies(hints->netdev_wd_timeout);
if (hints->driver_watchdog_timeout) { if (hints->driver_watchdog_timeout == ENA_HW_HINTS_NO_TIMEOUT)
adapter->keep_alive_timeout = ENA_HW_HINTS_NO_TIMEOUT; else
adapter->keep_alive_timeout =
msecs_to_jiffies(hints->driver_watchdog_timeout);
}
}
/* In case of LLQ use the llq fields for the tx SQ/CQ */ if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)
io_tx_sq_num = get_feat_ctx->llq.max_llq_num;
max_num_io_queues = min_t(u32, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES);
max_num_io_queues = min_t(u32, max_num_io_queues, io_rx_num);
max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_sq_num);
max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_cq_num); /* 1 IRQ for mgmnt and 1 IRQs for each IO direction */
max_num_io_queues = min_t(u32, max_num_io_queues, pci_msix_vec_count(pdev) - 1);
for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) {
val = ethtool_rxfh_indir_default(i, adapter->num_io_queues);
rc = ena_com_indirect_table_fill_entry(ena_dev, i,
ENA_IO_RXQ_IDX(val)); if (unlikely(rc)) {
dev_err(dev, "Cannot fill indirect table\n"); goto err_fill_indir;
}
}
rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); if (rc) {
dev_err(&pdev->dev, "ENA LLQ bar mapping failed\n"); goto err_metrics_destroy;
}
/* Need to do this before ena_device_init */
devlink = ena_devlink_alloc(adapter); if (!devlink) {
netdev_err(netdev, "ena_devlink_alloc failed\n");
rc = -ENOMEM; goto err_metrics_destroy;
}
/* From this point, the devlink device is visible to users. * Perform the registration last to ensure that all the resources * are available and that the netdevice is registered.
*/
ena_devlink_register(devlink, &pdev->dev);
/* __ena_shutoff - Helper used in both PCI remove/shutdown routines * @pdev: PCI device information struct * @shutdown: Is it a shutdown operation? If false, means it is a removal * * __ena_shutoff is a helper routine that does the real work on shutdown and * removal paths; the difference between those paths is with regards to whether * to detach or unregister the netdevice.
*/ staticvoid __ena_shutoff(struct pci_dev *pdev, bool shutdown)
{ struct ena_adapter *adapter = pci_get_drvdata(pdev); struct ena_com_dev *ena_dev; struct net_device *netdev;
/* Make sure timer and reset routine won't be called after * freeing device resources.
*/
timer_delete_sync(&adapter->timer_service);
cancel_work_sync(&adapter->reset_task);
rtnl_lock(); /* lock released inside the below if-else block */
adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN;
ena_destroy_device(adapter, true);
/* ena_remove - Device Removal Routine * @pdev: PCI device information struct * * ena_remove is called by the PCI subsystem to alert the driver * that it should release a PCI device.
*/
/* ena_shutdown - Device Shutdown Routine * @pdev: PCI device information struct * * ena_shutdown is called by the PCI subsystem to alert the driver that * a shutdown/reboot (or kexec) is happening and device must be disabled.
*/
rtnl_lock(); if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
dev_err(&pdev->dev, "Ignoring device reset request as the device is being suspended\n");
clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
}
ena_destroy_device(adapter, true);
rtnl_unlock(); return 0;
}
u64_stats_update_begin(&adapter->syncp); /* These stats are accumulated by the device, so the counters indicate * all drops since last reset.
*/
adapter->dev_stats.rx_drops = rx_drops;
adapter->dev_stats.tx_drops = tx_drops;
u64_stats_update_end(&adapter->syncp);
}
switch (aenq_e->aenq_common_desc.syndrome) { case ENA_ADMIN_UPDATE_HINTS:
hints = (struct ena_admin_ena_hw_hints *)
(&aenq_e->inline_data_w4);
ena_update_hints(adapter, hints); break; default:
netif_err(adapter, drv, adapter->netdev, "Invalid aenq notification link state %d\n",
aenq_e->aenq_common_desc.syndrome);
}
}
/* unimplemented_aenq_handler - fallback AENQ handler
 * @data: opaque pointer, interpreted as the struct ena_adapter
 * @aenq_e: the received AENQ entry (not inspected)
 *
 * This handler will be called for an unknown event group or for events with
 * unimplemented handlers. It only logs an error.
 */
static void unimplemented_aenq_handler(void *data,
				       struct ena_admin_aenq_entry *aenq_e)
{
	struct ena_adapter *adapter = data;

	netif_err(adapter, drv, adapter->netdev, "Unknown event was received or event with unimplemented handler\n");
}
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.66Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-28)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.