if (block->tx) { if (block->tx->q_num < priv->tx_cfg.num_queues)
reschedule |= gve_tx_poll(block, budget); elseif (budget)
reschedule |= gve_xdp_poll(block, budget);
}
if (!budget) return 0;
if (block->rx) {
work_done = gve_rx_poll(block, budget);
/* Poll XSK TX as part of RX NAPI. Setup re-poll based on max of * TX and RX work done.
*/ if (priv->xdp_prog)
work_done = max_t(int, work_done,
gve_xsk_tx_poll(block, budget));
reschedule |= work_done == budget;
}
if (reschedule) return budget;
/* Complete processing - don't unmask irq if busy polling is enabled */ if (likely(napi_complete_done(napi, work_done))) {
irq_doorbell = gve_irq_doorbell(priv, block);
iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell);
/* Ensure IRQ ACK is visible before we check pending work. * If queue had issued updates, it would be truly visible.
*/
mb();
if (block->tx)
reschedule |= gve_tx_clean_pending(priv, block->tx); if (block->rx)
reschedule |= gve_rx_work_pending(block->rx);
if (reschedule && napi_schedule(napi))
iowrite32be(GVE_IRQ_MASK, irq_doorbell);
} return work_done;
}
/* NAPI poll handler for DQO-format notify blocks.
 *
 * Polls TX (or XDP TX) completions, then RX, then either reschedules
 * NAPI (by returning the full budget) or completes and re-arms the
 * interrupt doorbell.  Returns the RX work done.
 */
int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
{
	struct gve_notify_block *block =
		container_of(napi, struct gve_notify_block, napi);
	struct gve_priv *priv = block->priv;
	bool reschedule = false;
	int work_done = 0;

	if (block->tx) {
		if (block->tx->q_num < priv->tx_cfg.num_queues)
			reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
		else
			reschedule |= gve_xdp_poll_dqo(block);
	}

	if (!budget)
		return 0;

	if (block->rx) {
		work_done = gve_rx_poll_dqo(block, budget);

		/* Poll XSK TX as part of RX NAPI. Setup re-poll based on if
		 * either datapath has more work to do.
		 */
		if (priv->xdp_prog)
			reschedule |= gve_xsk_tx_poll_dqo(block, budget);
		reschedule |= work_done == budget;
	}

	if (reschedule) {
		/* Reschedule by returning budget only if already on the correct
		 * cpu.
		 */
		if (likely(gve_is_napi_on_home_cpu(priv, block->irq)))
			return budget;

		/* If not on the cpu with which this queue's irq has affinity
		 * with, we avoid rescheduling napi and arm the irq instead so
		 * that napi gets rescheduled back eventually onto the right
		 * cpu.
		 */
		if (work_done == budget)
			work_done--;
	}

	if (likely(napi_complete_done(napi, work_done))) {
		/* Enable interrupts again.
		 *
		 * We don't need to repoll afterwards because HW supports the
		 * PCI MSI-X PBA feature.
		 *
		 * Another interrupt would be triggered if a new event came in
		 * since the last one.
		 */
		gve_write_irq_doorbell_dqo(priv, block,
					   GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
	}

	/* NOTE(review): the extracted chunk was truncated here; restoring the
	 * conventional tail (return the RX work done) so the function is
	 * well-formed.
	 */
	return work_done;
}
/* Allocate MSI-X vectors (queue vectors plus one management vector),
 * request the management IRQ, allocate the irq-doorbell index array,
 * and walk NUMA-node CPUs to plan per-block IRQ affinity.
 *
 * NOTE(review): this region is a mangled extraction.  "staticint",
 * "conststruct" and "unsignedint" are fused keywords (should be
 * "static int", "const struct", "unsigned int").  The body of the final
 * per-block loop (IRQ request/affinity setup) and the
 * "abort_with_msix_enabled"/"abort_with_mgmt_vector"/
 * "abort_with_msix_vectors" labels targeted by the gotos below are
 * missing -- restore from the upstream gve driver before building.
 */
staticint gve_alloc_notify_blocks(struct gve_priv *priv)
{ int num_vecs_requested = priv->num_ntfy_blks + 1; conststruct cpumask *node_mask; unsignedint cur_cpu; int vecs_enabled; int i, j; int err;
priv->msix_vectors = kvcalloc(num_vecs_requested, sizeof(*priv->msix_vectors), GFP_KERNEL); if (!priv->msix_vectors) return -ENOMEM; for (i = 0; i < num_vecs_requested; i++)
priv->msix_vectors[i].entry = i;
vecs_enabled = pci_enable_msix_range(priv->pdev, priv->msix_vectors,
GVE_MIN_MSIX, num_vecs_requested); if (vecs_enabled < 0) {
dev_err(&priv->pdev->dev, "Could not enable min msix %d/%d\n",
GVE_MIN_MSIX, vecs_enabled);
err = vecs_enabled; goto abort_with_msix_vectors;
} if (vecs_enabled != num_vecs_requested) { int new_num_ntfy_blks = (vecs_enabled - 1) & ~0x1; int vecs_per_type = new_num_ntfy_blks / 2; int vecs_left = new_num_ntfy_blks % 2;
priv->num_ntfy_blks = new_num_ntfy_blks;
priv->mgmt_msix_idx = priv->num_ntfy_blks;
priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues,
vecs_per_type);
priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues,
vecs_per_type + vecs_left);
dev_err(&priv->pdev->dev, "Could not enable desired msix, only enabled %d, adjusting tx max queues to %d, and rx max queues to %d\n",
vecs_enabled, priv->tx_cfg.max_queues,
priv->rx_cfg.max_queues); if (priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)
priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues)
priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
}
/* Setup Management Vector - the last vector */
snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "gve-mgmnt@pci:%s",
pci_name(priv->pdev));
err = request_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector,
gve_mgmnt_intr, 0, priv->mgmt_msix_name, priv); if (err) {
dev_err(&priv->pdev->dev, "Did not receive management vector.\n"); goto abort_with_msix_enabled;
}
priv->irq_db_indices =
dma_alloc_coherent(&priv->pdev->dev,
priv->num_ntfy_blks * sizeof(*priv->irq_db_indices),
&priv->irq_db_indices_bus, GFP_KERNEL); if (!priv->irq_db_indices) {
err = -ENOMEM; goto abort_with_mgmt_vector;
}
/* Setup the other blocks - the first n-1 vectors */
node_mask = gve_get_node_mask(priv);
/* NOTE(review): the loop below declares "block"/"msix_idx" but the code
 * that uses them (per-block IRQ request + affinity hint) was lost.
 */
cur_cpu = cpumask_first(node_mask); for (i = 0; i < priv->num_ntfy_blks; i++) { struct gve_notify_block *block = &priv->ntfy_blocks[i]; int msix_idx = i;
cur_cpu = cpumask_next(cur_cpu, node_mask); /* Wrap once CPUs in the node have been exhausted, or when * starting RX queue affinities. TX and RX queues of the same * index share affinity.
*/ if (cur_cpu >= nr_cpu_ids || (i + 1) == priv->tx_cfg.max_queues)
cur_cpu = cpumask_first(node_mask);
} return 0;
/* NOTE(review): orphaned fragment.  An "abort_with_some_ntfy_blocks"
 * unwind loop (from gve_alloc_notify_blocks) is fused with the tail of
 * a QPL-unregister routine (the num_tx_qpls/num_rx_qpls loops below);
 * the code in between was lost during extraction, and
 * num_tx_qpls/num_rx_qpls are not declared anywhere in view.
 */
abort_with_some_ntfy_blocks: for (j = 0; j < i; j++) { struct gve_notify_block *block = &priv->ntfy_blocks[j]; int msix_idx = j;
for (i = 0; i < num_tx_qpls; i++) {
err = gve_unregister_qpl(priv, gve_tx_get_qpl(priv, i)); /* This failure will trigger a reset - no need to clean */ if (err) return err;
}
for (i = 0; i < num_rx_qpls; i++) {
err = gve_unregister_qpl(priv, gve_rx_get_qpl(priv, i)); /* This failure will trigger a reset - no need to clean */ if (err) return err;
} return 0;
}
/* Create the TX and RX queues on the device via the admin queue, then
 * provide the NIC with RX buffers (GQI: ring the prefilled doorbells;
 * DQO: post buffers).
 *
 * Returns 0 on success or a negative admin-queue error.  Admin-queue
 * failures here trigger a device reset, so no local cleanup is needed.
 */
static int gve_create_rings(struct gve_priv *priv)
{
	int num_tx_queues = gve_num_tx_queues(priv);
	int err;
	int i;

	err = gve_adminq_create_tx_queues(priv, 0, num_tx_queues);
	if (err) {
		netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n",
			  num_tx_queues);
		/* This failure will trigger a reset - no need to clean
		 * up
		 */
		return err;
	}
	netif_dbg(priv, drv, priv->dev, "created %d tx queues\n",
		  num_tx_queues);

	err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
	if (err) {
		netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n",
			  priv->rx_cfg.num_queues);
		/* This failure will trigger a reset - no need to clean
		 * up
		 */
		return err;
	}
	netif_dbg(priv, drv, priv->dev, "created %d rx queues\n",
		  priv->rx_cfg.num_queues);

	if (gve_is_gqi(priv)) {
		/* Rx data ring has been prefilled with packet buffers at queue
		 * allocation time.
		 *
		 * Write the doorbell to provide descriptor slots and packet
		 * buffers to the NIC.
		 */
		for (i = 0; i < priv->rx_cfg.num_queues; i++)
			gve_rx_write_doorbell(priv, &priv->rx[i]);
	} else {
		for (i = 0; i < priv->rx_cfg.num_queues; i++) {
			/* Post buffers and ring doorbell. */
			gve_rx_post_buffers_dqo(&priv->rx[i]);
		}
	}

	return 0;
}
/* NOTE(review): truncated fragment.  init_xdp_sync_stats() opens its
 * per-XDP-queue stats loop here, but the loop body and the function's
 * end were lost in extraction -- the next line in the file begins a
 * different function.  Also "staticvoid" is a fused "static void".
 */
staticvoid init_xdp_sync_stats(struct gve_priv *priv)
{ int start_id = gve_xdp_tx_start_queue_id(priv); int i;
/* Init stats */ for (i = start_id; i < start_id + priv->tx_cfg.num_xdp_queues; i++) { int ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
/* Destroy the device's TX and RX queues via the admin queue.
 *
 * Returns 0 on success or a negative admin-queue error.  Failures here
 * trigger a device reset, so no local cleanup is needed.
 */
static int gve_destroy_rings(struct gve_priv *priv)
{
	int num_tx_queues = gve_num_tx_queues(priv);
	int err;

	err = gve_adminq_destroy_tx_queues(priv, 0, num_tx_queues);
	if (err) {
		netif_err(priv, drv, priv->dev, "failed to destroy tx queues\n");
		/* This failure will trigger a reset - no need to clean up */
		return err;
	}
	netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n");

	err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
	if (err) {
		netif_err(priv, drv, priv->dev, "failed to destroy rx queues\n");
		/* This failure will trigger a reset - no need to clean up */
		return err;
	}
	netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n");

	return 0;
}
/* Use this to schedule a reset when the device is capable of continuing
 * to handle other requests in its current state. If it is not, do a reset
 * in thread instead.
 */
void gve_schedule_reset(struct gve_priv *priv)
{
	/* Mark the reset request and kick the service task to act on it. */
	gve_set_do_reset(priv);
	queue_work(priv->gve_wq, &priv->service_task);
}
/* The passed-in queue memory is stored into priv and the queues are made live. * No memory is allocated. Passed-in memory is freed on errors.
*/ staticint gve_queues_start(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *tx_alloc_cfg, struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
{ struct net_device *dev = priv->dev; int err;
/* NOTE(review): truncated function.  "staticint" is a fused
 * "static int".  After recording the new configs below, the middle of
 * the function (queue registration, ring creation, turnup -- the code
 * that jumps to the "reset:"/"stop_and_free_rings:" labels and assigns
 * "err") is missing, so as written control falls straight into the
 * error labels and "err" is returned uninitialized.  Restore from the
 * upstream gve driver.
 */
/* Record new resources into priv */
priv->tx = tx_alloc_cfg->tx;
priv->rx = rx_alloc_cfg->rx;
/* Record new configs into priv */
priv->tx_cfg = *tx_alloc_cfg->qcfg;
priv->tx_cfg.num_xdp_queues = tx_alloc_cfg->num_xdp_rings;
priv->rx_cfg = *rx_alloc_cfg->qcfg_rx;
priv->tx_desc_cnt = tx_alloc_cfg->ring_size;
priv->rx_desc_cnt = rx_alloc_cfg->ring_size;
reset: if (gve_get_reset_in_progress(priv)) goto stop_and_free_rings;
gve_reset_and_teardown(priv, true); /* if this fails there is nothing we can do so just ignore the return */
gve_reset_recovery(priv, false); /* return the original error */ return err;
stop_and_free_rings:
gve_tx_stop_rings(priv, gve_num_tx_queues(priv));
gve_rx_stop_rings(priv, priv->rx_cfg.num_queues);
gve_queues_mem_remove(priv); return err;
}
/* NOTE(review): orphaned fragment -- the tail of a caller (presumably
 * gve_open: allocate queue memory, then hand ownership to
 * gve_queues_start).  Its opening signature and local declarations were
 * lost during extraction.
 */
err = gve_queues_mem_alloc(priv, &tx_alloc_cfg, &rx_alloc_cfg); if (err) return err;
/* No need to free on error: ownership of resources is lost after * calling gve_queues_start.
*/
err = gve_queues_start(priv, &tx_alloc_cfg, &rx_alloc_cfg); if (err) return err;
return 0;
}
/* Quiesce and destroy the live queues: turn down traffic, destroy rings
 * and unregister QPLs via the admin queue, stop the stats timer; on an
 * admin-queue failure, fall back to a full reset (unless a reset is
 * already in progress).
 *
 * NOTE(review): "staticint" is a fused "static int".  The success path
 * appears truncated: upstream has ring-stop/memory-remove calls and a
 * "return 0;" between timer_delete_sync() and the "err:" label; as
 * written, a successful stop falls through into the error path and
 * performs a spurious reset.  Confirm against the upstream driver.
 */
staticint gve_queues_stop(struct gve_priv *priv)
{ int err;
netif_carrier_off(priv->dev); if (gve_get_device_rings_ok(priv)) {
gve_turndown(priv);
gve_drain_page_cache(priv);
err = gve_destroy_rings(priv); if (err) goto err;
err = gve_unregister_qpls(priv); if (err) goto err;
gve_clear_device_rings_ok(priv);
}
timer_delete_sync(&priv->stats_report_timer);
err: /* This must have been called from a reset due to the rtnl lock * so just return at this point.
*/ if (gve_get_reset_in_progress(priv)) return err; /* Otherwise reset before returning */
gve_reset_and_teardown(priv, true); return gve_reset_recovery(priv, false);
}
/* Propagate a device-reported link-status change to the netdev carrier
 * state, logging the transition.  No-op while NAPI is disabled (the
 * interface is not up) or when the carrier already matches.
 */
static void gve_handle_link_status(struct gve_priv *priv, bool link_status)
{
	if (!gve_get_napi_enabled(priv))
		return;

	if (link_status == netif_carrier_ok(priv->dev))
		return;

	if (link_status) {
		netdev_info(priv->dev, "Device link is up.\n");
		netif_carrier_on(priv->dev);
	} else {
		netdev_info(priv->dev, "Device link is down.\n");
		netif_carrier_off(priv->dev);
	}
}
/* NOTE(review): orphaned fragment -- the tail of an XDP-configuration
 * validation routine (MTU limit and queue-count checks).  Its opening
 * signature and the declaration of "max_xdp_mtu" were lost during
 * extraction.
 */
if (dev->mtu > max_xdp_mtu) {
netdev_warn(dev, "XDP is not supported for mtu %d.\n",
dev->mtu); return -EOPNOTSUPP;
}
if (priv->rx_cfg.num_queues != priv->tx_cfg.num_queues ||
(2 * priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)) {
netdev_warn(dev, "XDP load failed: The number of configured RX queues %d should be equal to the number of configured TX queues %d and the number of configured RX/TX queues should be less than or equal to half the maximum number of RX/TX queues %d",
priv->rx_cfg.num_queues,
priv->tx_cfg.num_queues,
priv->tx_cfg.max_queues); return -EINVAL;
} return 0;
}
/* Clear all flow-steering rules on the device via the admin queue.
 * No-op (returns 0) when the device reports no flow-rule support.
 */
int gve_flow_rules_reset(struct gve_priv *priv)
{
	if (!priv->max_flow_rules)
		return 0;

	return gve_adminq_reset_flow_rules(priv);
}
/* Swap in a new queue configuration: allocate the new resources first,
 * close the device (freeing the old resources), then restart with the
 * new ones.  On a restart failure, ownership of the new resources has
 * already passed to gve_queues_start, so only a turndown is done.
 *
 * Returns 0 on success or a negative error.
 */
int gve_adjust_config(struct gve_priv *priv,
		      struct gve_tx_alloc_rings_cfg *tx_alloc_cfg,
		      struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
{
	int err;

	/* Allocate resources for the new configuration */
	err = gve_queues_mem_alloc(priv, tx_alloc_cfg, rx_alloc_cfg);
	if (err) {
		netif_err(priv, drv, priv->dev, "Adjust config failed to alloc new queues");
		return err;
	}

	/* Teardown the device and free existing resources */
	err = gve_close(priv->dev);
	if (err) {
		netif_err(priv, drv, priv->dev, "Adjust config failed to close old queues");
		gve_queues_mem_free(priv, tx_alloc_cfg, rx_alloc_cfg);
		return err;
	}

	/* Bring the device back up again with the new resources. */
	err = gve_queues_start(priv, tx_alloc_cfg, rx_alloc_cfg);
	if (err) {
		netif_err(priv, drv, priv->dev, "Adjust config failed to start new queues, !!! DISABLING ALL QUEUES !!!\n");
		/* No need to free on error: ownership of resources is lost after
		 * calling gve_queues_start.
		 */
		gve_turndown(priv);
		return err;
	}

	/* NOTE(review): the extracted chunk was truncated here; restoring the
	 * conventional success return.
	 */
	return 0;
}
/* NOTE(review): orphaned fragment -- the tail of a queue-adjustment
 * routine (presumably gve_adjust_queues): it fills alloc configs from
 * ethtool-provided new_tx_config/new_rx_config and either applies them
 * live via gve_adjust_config() or stores them for the next up.  The
 * opening signature and the declarations of tx_alloc_cfg/rx_alloc_cfg,
 * new_tx_config/new_rx_config and reset_rss were lost during
 * extraction.
 */
/* Relay the new config from ethtool */
tx_alloc_cfg.qcfg = &new_tx_config;
rx_alloc_cfg.qcfg_tx = &new_tx_config;
rx_alloc_cfg.qcfg_rx = &new_rx_config;
rx_alloc_cfg.reset_rss = reset_rss;
if (netif_running(priv->dev)) {
err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg); return err;
} /* Set the config for the next up. */ if (reset_rss) {
err = gve_init_rss_config(priv, new_rx_config.num_queues); if (err) return err;
}
priv->tx_cfg = new_tx_config;
priv->rx_cfg = new_rx_config;
return 0;
}
/* Quiesce traffic: drop the carrier and disable NAPI/queue association
 * on the TX notify blocks.
 *
 * NOTE(review): "staticvoid" is a fused "static void", and the function
 * is truncated -- the NAPI-disable call inside the TX loop, the whole
 * RX-queue loop, and the end of the function were lost during
 * extraction (the next line begins a different function).
 */
staticvoid gve_turndown(struct gve_priv *priv)
{ int idx;
if (netif_carrier_ok(priv->dev))
netif_carrier_off(priv->dev);
if (!gve_get_napi_enabled(priv)) return;
/* Disable napi to prevent more work from coming in */ for (idx = 0; idx < gve_num_tx_queues(priv); idx++) { int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
if (!gve_tx_was_added_to_block(priv, idx)) continue;
if (idx < priv->tx_cfg.num_queues)
netif_queue_set_napi(priv->dev, idx,
NETDEV_QUEUE_TYPE_TX, NULL);
/* Make sure that all traffic is finished processing. */
synchronize_net();
}
/* Re-enable traffic: start TX queues, enable NAPI on the TX blocks, and
 * kick a one-off NAPI schedule on each block to catch descriptors
 * written before interrupts were armed.
 *
 * NOTE(review): "staticvoid" is a fused "static void", and the function
 * is truncated -- the RX loop below lacks its NAPI-enable /
 * netif_queue_set_napi calls, and the end of the function (after the
 * xdp_features call) was lost during extraction.  Also verify the
 * "false" argument to xdp_features_set_redirect_target_locked() against
 * upstream; enabling a redirect target with "false" looks suspicious.
 */
staticvoid gve_turnup(struct gve_priv *priv)
{ int idx;
/* Start the tx queues */
netif_tx_start_all_queues(priv->dev);
/* Enable napi and unmask interrupts for all queues */ for (idx = 0; idx < gve_num_tx_queues(priv); idx++) { int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
if (!gve_tx_was_added_to_block(priv, idx)) continue;
napi_enable_locked(&block->napi);
if (idx < priv->tx_cfg.num_queues)
netif_queue_set_napi(priv->dev, idx,
NETDEV_QUEUE_TYPE_TX,
&block->napi);
/* Any descs written by the NIC before this barrier will be * handled by the one-off napi schedule below. Whereas any * descs after the barrier will generate interrupts.
*/
mb();
napi_schedule(&block->napi);
} for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
if (!gve_rx_was_added_to_block(priv, idx)) continue;
/* Any descs written by the NIC before this barrier will be * handled by the one-off napi schedule below. Whereas any * descs after the barrier will generate interrupts.
*/
mb();
napi_schedule(&block->napi);
}
if (priv->tx_cfg.num_xdp_queues && gve_supports_xdp_xmit(priv))
xdp_features_set_redirect_target_locked(priv->dev, false);
/* NOTE(review): orphaned fragment -- the interior of a hardware
 * timestamping configuration handler (rejects TX timestamping, and RX
 * timestamping when the NIC timestamp report is unavailable).  The
 * opening signature declaring kernel_config/extack/priv and the rest of
 * the function were lost during extraction.
 */
if (kernel_config->tx_type != HWTSTAMP_TX_OFF) {
NL_SET_ERR_MSG_MOD(extack, "TX timestamping is not supported"); return -ERANGE;
}
if (kernel_config->rx_filter != HWTSTAMP_FILTER_NONE) { if (!priv->nic_ts_report) {
NL_SET_ERR_MSG_MOD(extack, "RX timestamping is not supported");
kernel_config->rx_filter = HWTSTAMP_FILTER_NONE; return -EOPNOTSUPP;
}
/* NOTE(review): truncated fragment.  gve_handle_reset() begins here
 * with its probe-in-progress guard, but the remainder of the body and
 * the closing brace were lost during extraction.  "staticvoid" is a
 * fused "static void".
 */
staticvoid gve_handle_reset(struct gve_priv *priv)
{ /* A service task will be scheduled at the end of probe to catch any * resets that need to happen, and we don't want to reset until * probe is done.
*/ if (gve_get_probe_in_progress(priv)) return;
/* Initialize driver-private device state: allocate the admin queue,
 * verify driver compatibility, describe the device, size MSI-X /
 * notify-block usage, and record defaults (MTU, copybreak, NUMA node).
 *
 * NOTE(review): truncated and mangled.  "staticint" is a fused
 * "static int" and "elseif" a fused "else if".  The function body ends
 * abruptly after the numa_node assignment -- the "setup_device:" label
 * targeted by the goto below, the "err:" label targeted by the error
 * gotos, and the rest of the function were lost during extraction.
 */
staticint gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
{ int num_ntfy; int err;
/* Set up the adminq */
err = gve_adminq_alloc(&priv->pdev->dev, priv); if (err) {
dev_err(&priv->pdev->dev, "Failed to alloc admin queue: err=%d\n", err); return err;
}
err = gve_verify_driver_compatibility(priv); if (err) {
dev_err(&priv->pdev->dev, "Could not verify driver compatibility: err=%d\n", err); goto err;
}
priv->num_registered_pages = 0;
if (skip_describe_device) goto setup_device;
priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED; /* Get the initial information we need from the device */
err = gve_adminq_describe_device(priv); if (err) {
dev_err(&priv->pdev->dev, "Could not get device information: err=%d\n", err); goto err;
}
priv->dev->mtu = priv->dev->max_mtu;
num_ntfy = pci_msix_vec_count(priv->pdev); if (num_ntfy <= 0) {
dev_err(&priv->pdev->dev, "could not count MSI-x vectors: err=%d\n", num_ntfy);
err = num_ntfy; goto err;
} elseif (num_ntfy < GVE_MIN_MSIX) {
dev_err(&priv->pdev->dev, "gve needs at least %d MSI-x vectors, but only has %d\n",
GVE_MIN_MSIX, num_ntfy);
err = -EINVAL; goto err;
}
/* Big TCP is only supported on DQO */ if (!gve_is_gqi(priv))
netif_set_tso_max_size(priv->dev, GVE_DQO_TX_MAX);
priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; /* gvnic has one Notification Block per MSI-x vector, except for the * management vector
*/
priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1;
priv->mgmt_msix_idx = priv->num_ntfy_blks;
priv->numa_node = dev_to_node(&priv->pdev->dev);
/* Trigger a device reset. */
static void gve_trigger_reset(struct gve_priv *priv)
{
	/* Reset the device by releasing the AQ */
	gve_adminq_release(priv);
}
/* Reset the device, close it if it was up, and free private resources. */
static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up)
{
	gve_trigger_reset(priv);
	/* With the reset having already happened, close cannot fail */
	if (was_up)
		gve_close(priv->dev);
	gve_teardown_priv_resources(priv);
}
/* Bring the device back up after a reset: re-init private state and,
 * if the interface was up before, reopen it.  On failure all queues are
 * disabled via gve_turndown() and the error is returned.
 */
static int gve_reset_recovery(struct gve_priv *priv, bool was_up)
{
	int err;

	err = gve_init_priv(priv, true);
	if (err)
		goto err;
	if (was_up) {
		err = gve_open(priv->dev);
		if (err)
			goto err;
	}
	return 0;

err:
	dev_err(&priv->pdev->dev, "Reset failed! !!! DISABLING ALL QUEUES !!!\n");
	gve_turndown(priv);
	return err;
}
/* Perform a full device reset.  When attempt_teardown is set, try a
 * normal close first and fall back to a forced reset-and-teardown if
 * that fails; otherwise go straight to turndown + reset.  Afterwards
 * run recovery, clear the in-progress flag, and reset the interface
 * counters.  Returns the recovery result.
 */
int gve_reset(struct gve_priv *priv, bool attempt_teardown)
{
	bool was_up = netif_running(priv->dev);
	int err;

	dev_info(&priv->pdev->dev, "Performing reset\n");
	gve_clear_do_reset(priv);
	gve_set_reset_in_progress(priv);
	/* If we aren't attempting to teardown normally, just go turndown and
	 * reset right away.
	 */
	if (!attempt_teardown) {
		gve_turndown(priv);
		gve_reset_and_teardown(priv, was_up);
	} else {
		/* Otherwise attempt to close normally */
		if (was_up) {
			err = gve_close(priv->dev);
			/* If that fails reset as we did above */
			if (err)
				gve_reset_and_teardown(priv, was_up);
		}
		/* Clean up any remaining resources */
		gve_teardown_priv_resources(priv);
	}

	/* Set it all back up */
	err = gve_reset_recovery(priv, was_up);
	gve_clear_reset_in_progress(priv);
	priv->reset_cnt++;
	priv->interface_up_cnt = 0;
	priv->interface_down_cnt = 0;
	priv->stats_report_trigger_cnt = 0;
	return err;
}
/* NOTE(review): extraneous German boilerplate from the extraction
 * source (a website disclaimer, not driver source code); commented out
 * so the file remains valid C.  It reads, in translation: "The
 * information on this website was carefully compiled to the best of our
 * knowledge.  However, neither completeness, correctness, nor quality
 * of the provided information is guaranteed.  Remark: the syntax
 * highlighting and the measurement are still experimental."
 *
 * Original text:
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
 * noch Qualität der bereit gestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */