/**
 * iavf_is_descriptor_done - tests DD bit in Rx descriptor
 * @qw1: quad word 1 from descriptor to get Descriptor Done field from
 * @flex: is the descriptor flex or legacy
 *
 * This function tests the descriptor done bit in specified descriptor. Because
 * there are two types of descriptors (legacy and flex) the parameter @flex
 * is used to distinguish.
 *
 * Return: true or false based on the state of DD bit in Rx descriptor.
 */
static bool iavf_is_descriptor_done(u64 qw1, bool flex)
{
	/* The DD bit lives at a different position in the legacy and flex
	 * writeback formats, so pick the matching mask.
	 */
	if (flex)
		return FIELD_GET(IAVF_RXD_FLEX_DD_M, qw1);
	else
		return FIELD_GET(IAVF_RXD_LEGACY_DD_M, qw1);
}
/**
 * iavf_unmap_and_free_tx_resource - Release a Tx buffer
 * @ring: the ring that owns the buffer
 * @tx_buffer: the buffer to free
 **/
static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring,
					    struct iavf_tx_buffer *tx_buffer)
{
	if (tx_buffer->skb) {
		/* Flow director (sideband) buffers hold raw memory rather
		 * than an skb and must be freed with kfree().
		 */
		if (tx_buffer->tx_flags & IAVF_TX_FLAGS_FD_SB)
			kfree(tx_buffer->raw_buf);
		else
			dev_kfree_skb_any(tx_buffer->skb);
		if (dma_unmap_len(tx_buffer, len))
			dma_unmap_single(ring->dev,
					 dma_unmap_addr(tx_buffer, dma),
					 dma_unmap_len(tx_buffer, len),
					 DMA_TO_DEVICE);
	} else if (dma_unmap_len(tx_buffer, len)) {
		/* No skb attached: this buffer carries paged frag data only */
		dma_unmap_page(ring->dev,
			       dma_unmap_addr(tx_buffer, dma),
			       dma_unmap_len(tx_buffer, len),
			       DMA_TO_DEVICE);
	}

	tx_buffer->next_to_watch = NULL;
	tx_buffer->skb = NULL;
	dma_unmap_len_set(tx_buffer, len, 0);
	/* tx_buffer must be completely set up in the transmit path */
}
/** * iavf_clean_tx_ring - Free any empty Tx buffers * @tx_ring: ring to be cleaned
**/ staticvoid iavf_clean_tx_ring(struct iavf_ring *tx_ring)
{ unsignedlong bi_size;
u16 i;
/* ring already cleared, nothing to do */ if (!tx_ring->tx_bi) return;
/* Free all the Tx ring sk_buffs */ for (i = 0; i < tx_ring->count; i++)
iavf_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
/**
 * iavf_get_tx_pending - how many Tx descriptors not processed
 * @ring: the ring of descriptors
 * @in_sw: is tx_pending being checked in SW or HW
 *
 * Since there is no access to the ring head register
 * in XL710, we need to use our local copies
 **/
static u32 iavf_get_tx_pending(struct iavf_ring *ring, bool in_sw)
{
	u32 ntc, ntu;

	/* underlying hardware might not allow access and/or always return
	 * 0 for the head/tail registers so just use the cached values
	 */
	ntc = ring->next_to_clean;
	ntu = ring->next_to_use;

	if (ntc == ntu)
		return 0;

	/* account for the producer index having wrapped past the consumer */
	return ntc < ntu ? ntu - ntc : ntu + ring->count - ntc;
}
/**
 * iavf_force_wb - Issue SW Interrupt so HW does a wb
 * @vsi: the VSI we care about
 * @q_vector: the vector on which to force writeback
 *
 * NOTE(review): the visible body only builds the DYN_CTLN1 register
 * value; the register write and the function's closing brace appear to
 * be missing -- confirm against the complete source.  "staticvoid" is a
 * mangled keyword pair (should be "static void").
 **/ staticvoid iavf_force_wb(struct iavf_vsi *vsi, struct iavf_q_vector *q_vector)
{
	u32 val = IAVF_VFINT_DYN_CTLN1_INTENA_MASK |
		  IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK | /* set noitr */
		  IAVF_VFINT_DYN_CTLN1_SWINT_TRIG_MASK |
		  IAVF_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK /* allow 00 to be written to the index */;
/**
 * iavf_detect_recover_hung - Function to detect and recover hung_queues
 * @vsi: pointer to vsi struct with tx queues
 *
 * VSI has netdev and netdev has TX queues. This function is to check each of
 * those TX queues if they are hung, trigger recovery by issuing SW interrupt.
 **/
void iavf_detect_recover_hung(struct iavf_vsi *vsi)
{
	struct iavf_ring *tx_ring = NULL;
	struct net_device *netdev;
	unsigned int i;
	int packets;

	if (!vsi)
		return;

	if (test_bit(__IAVF_VSI_DOWN, vsi->state))
		return;

	netdev = vsi->netdev;
	if (!netdev)
		return;

	if (!netif_carrier_ok(netdev))
		return;

	for (i = 0; i < vsi->back->num_active_queues; i++) {
		tx_ring = &vsi->back->tx_rings[i];
		if (tx_ring && tx_ring->desc) {
			/* If packet counter has not changed the queue is
			 * likely stalled, so force an interrupt for this
			 * queue.
			 *
			 * prev_pkt_ctr would be negative if there was no
			 * pending work.
			 */
			packets = tx_ring->stats.packets & INT_MAX;
			if (tx_ring->prev_pkt_ctr == packets) {
				iavf_force_wb(vsi, tx_ring->q_vector);
				continue;
			}

			/* Memory barrier between read of packet count and call
			 * to iavf_get_tx_pending()
			 */
			smp_rmb();
			tx_ring->prev_pkt_ctr =
				iavf_get_tx_pending(tx_ring, true) ? packets : -1;
		}
	}
}
#define WB_STRIDE 4
/**
 * iavf_clean_tx_irq - Reclaim resources after transmit completes
 * @vsi: the VSI we care about
 * @tx_ring: Tx ring to clean
 * @napi_budget: Used to determine if we are in netpoll
 *
 * Returns true if there's any budget left (e.g. the clean is finished)
 *
 * NOTE(review): several pieces appear to be missing from the visible
 * text: the skb free / total_bytes+total_packets accounting after the
 * EOP descriptor is reached, the head of the inner frag-unmap loop
 * (only its closing brace survives), the tail of the WB_ON_ITR branch
 * that consumes @j, and the final stats/next_to_clean writeback.  The
 * code below is preserved byte-for-byte (including mangled tokens such
 * as "staticbool"/"unsignedint"); confirm against the complete source.
 **/ staticbool iavf_clean_tx_irq(struct iavf_vsi *vsi, struct iavf_ring *tx_ring, int napi_budget)
{
	int i = tx_ring->next_to_clean;
	struct iavf_tx_buffer *tx_buf;
	struct iavf_tx_desc *tx_desc;
	unsignedint total_bytes = 0, total_packets = 0;
	unsignedint budget = IAVF_DEFAULT_IRQ_WORK;

	tx_buf = &tx_ring->tx_bi[i];
	tx_desc = IAVF_TX_DESC(tx_ring, i);
	/* i is kept as a negative offset from the ring end so wrap-around
	 * below is a simple "!i" test
	 */
	i -= tx_ring->count;

	do {
		struct iavf_tx_desc *eop_desc = tx_buf->next_to_watch;

		/* if next_to_watch is not set then there is no work pending */
		if (!eop_desc)
			break;

		/* prevent any other reads prior to eop_desc */
		smp_rmb();

		iavf_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
		/* if the descriptor isn't done, no work yet to do */
		if (!(eop_desc->cmd_type_offset_bsz &
		      cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE)))
			break;

		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_bi;
			tx_desc = IAVF_TX_DESC(tx_ring, 0);
		}

		/* NOTE(review): the skb free and byte/packet accounting that
		 * normally precedes the frag walk is missing here, as is the
		 * head of the loop the brace below closes.
		 */

		/* unmap any remaining paged data */
		if (dma_unmap_len(tx_buf, len)) {
			dma_unmap_page(tx_ring->dev,
				       dma_unmap_addr(tx_buf, dma),
				       dma_unmap_len(tx_buf, len),
				       DMA_TO_DEVICE);
			dma_unmap_len_set(tx_buf, len, 0);
		}
	}

		/* move us one more past the eop_desc for start of next pkt */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_bi;
			tx_desc = IAVF_TX_DESC(tx_ring, 0);
		}

		prefetch(tx_desc);

		/* update budget accounting */
		budget--;
	} while (likely(budget));

	if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) {
		/* check to see if there are < 4 descriptors
		 * waiting to be written back, then kick the hardware to force
		 * them to be written back in case we stay in NAPI.
		 * In this mode on X722 we do not enable Interrupt.
		 */
		unsignedint j = iavf_get_tx_pending(tx_ring, false);

	/* NOTE(review): the branch above never closes in the visible text;
	 * the code that uses @j is missing.
	 */

	/* notify netdev of completed buffers */
	netdev_tx_completed_queue(txring_txq(tx_ring),
				  total_packets, total_bytes);

#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
		     (IAVF_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
		/* Make sure that anybody stopping the queue after this
		 * sees the new next_to_clean.
		 */
		smp_mb();
		if (__netif_subqueue_stopped(tx_ring->netdev,
					     tx_ring->queue_index) &&
		    !test_bit(__IAVF_VSI_DOWN, vsi->state)) {
			netif_wake_subqueue(tx_ring->netdev,
					    tx_ring->queue_index);
			++tx_ring->tx_stats.restart_queue;
		}
	}

	return !!budget;
}
/**
 * iavf_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
 * @vsi: the VSI we care about
 * @q_vector: the vector on which to enable writeback
 *
 * NOTE(review): the visible body stops after building @val; the register
 * write, the arm_wb_state update and the closing brace appear to be
 * missing -- confirm against the complete source.  "staticvoid" is a
 * mangled keyword pair.
 **/ staticvoid iavf_enable_wb_on_itr(struct iavf_vsi *vsi, struct iavf_q_vector *q_vector)
{
	u16 flags = q_vector->tx.ring[0].flags;
	u32 val;

	/* only relevant when the ring requested write-back on ITR */
	if (!(flags & IAVF_TXR_FLAGS_WB_ON_ITR))
		return;

	/* already armed; nothing to do */
	if (q_vector->arm_wb_state)
		return;

	val = IAVF_VFINT_DYN_CTLN1_WB_ON_ITR_MASK |
	      IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK; /* set noitr */
/**
 * iavf_mbps_itr_multiplier - map link speed in Mbps to an ITR multiplier
 * @speed_mbps: link speed as a SPEED_* Mbps value
 *
 * Return: the adaptive-ITR multiplier matching the link speed; unknown
 * speeds fall through to the 10G multiplier.
 */
static unsigned int iavf_mbps_itr_multiplier(u32 speed_mbps)
{
	switch (speed_mbps) {
	case SPEED_100000:
		return IAVF_AIM_MULTIPLIER_100G;
	case SPEED_50000:
		return IAVF_AIM_MULTIPLIER_50G;
	case SPEED_40000:
		return IAVF_AIM_MULTIPLIER_40G;
	case SPEED_25000:
	case SPEED_20000:
		return IAVF_AIM_MULTIPLIER_20G;
	case SPEED_10000:
	default:
		return IAVF_AIM_MULTIPLIER_10G;
	case SPEED_1000:
	case SPEED_100:
		return IAVF_AIM_MULTIPLIER_1G;
	}
}
/**
 * iavf_virtchnl_itr_multiplier - map a virtchnl link speed to an ITR multiplier
 * @speed_virtchnl: link speed as a virtchnl_link_speed enum value
 *
 * Return: the adaptive-ITR multiplier matching the link speed; unknown
 * speeds fall through to the 10G multiplier.
 */
static unsigned int
iavf_virtchnl_itr_multiplier(enum virtchnl_link_speed speed_virtchnl)
{
	switch (speed_virtchnl) {
	case VIRTCHNL_LINK_SPEED_40GB:
		return IAVF_AIM_MULTIPLIER_40G;
	case VIRTCHNL_LINK_SPEED_25GB:
	case VIRTCHNL_LINK_SPEED_20GB:
		return IAVF_AIM_MULTIPLIER_20G;
	case VIRTCHNL_LINK_SPEED_10GB:
	default:
		return IAVF_AIM_MULTIPLIER_10G;
	case VIRTCHNL_LINK_SPEED_1GB:
	case VIRTCHNL_LINK_SPEED_100MB:
		return IAVF_AIM_MULTIPLIER_1G;
	}
}
/**
 * iavf_update_itr - update the dynamic ITR value based on statistics
 * @q_vector: structure containing interrupt and ring information
 * @rc: structure containing ring performance data
 *
 * Stores a new ITR value based on packets and byte
 * counts during the last interrupt.  The advantage of per interrupt
 * computation is faster updates and more accurate ITR for the current
 * traffic pattern.  Constants in this function were computed
 * based on theoretical maximum wire speed and thresholds were set based
 * on testing data as well as attempting to minimize response time
 * while increasing bulk throughput.
 *
 * NOTE(review): in the visible text @packets and @bytes are read but
 * never assigned (presumably loaded from rc->total_packets /
 * rc->total_bytes -- that code is missing), and the stray closing brace
 * before the bulk path suggests a lost "if (packets < 256) {" guard.
 * Mangled keyword pairs ("staticvoid", "elseif", "unsignedint",
 * "unsignedlong") are preserved byte-for-byte.  Confirm against the
 * complete source before building.
 **/ staticvoid iavf_update_itr(struct iavf_q_vector *q_vector, struct iavf_ring_container *rc)
{
	unsignedint avg_wire_size, packets, bytes, itr;
	unsignedlong next_update = jiffies;

	/* If we don't have any rings just leave ourselves set for maximum
	 * possible latency so we take ourselves out of the equation.
	 */
	if (!rc->ring || !ITR_IS_DYNAMIC(rc->ring->itr_setting))
		return;

	/* For Rx we want to push the delay up and default to low latency.
	 * for Tx we want to pull the delay down and default to high latency.
	 */
	itr = iavf_container_is_rx(q_vector, rc) ?
	      IAVF_ITR_ADAPTIVE_MIN_USECS | IAVF_ITR_ADAPTIVE_LATENCY :
	      IAVF_ITR_ADAPTIVE_MAX_USECS | IAVF_ITR_ADAPTIVE_LATENCY;

	/* If we didn't update within up to 1 - 2 jiffies we can assume
	 * that either packets are coming in so slow there hasn't been
	 * any work, or that there is so much work that NAPI is dealing
	 * with interrupt moderation and we don't need to do anything.
	 */
	if (time_after(next_update, rc->next_update))
		goto clear_counts;

	/* If itr_countdown is set it means we programmed an ITR within
	 * the last 4 interrupt cycles. This has a side effect of us
	 * potentially firing an early interrupt. In order to work around
	 * this we need to throw out any data received for a few
	 * interrupts following the update.
	 */
	if (q_vector->itr_countdown) {
		itr = rc->target_itr;
		goto clear_counts;
	}

	if (iavf_container_is_rx(q_vector, rc)) {
		/* If Rx there are 1 to 4 packets and bytes are less than
		 * 9000 assume insufficient data to use bulk rate limiting
		 * approach unless Tx is already in bulk rate limiting. We
		 * are likely latency driven.
		 */
		if (packets && packets < 4 && bytes < 9000 &&
		    (q_vector->tx.target_itr & IAVF_ITR_ADAPTIVE_LATENCY)) {
			itr = IAVF_ITR_ADAPTIVE_LATENCY;
			goto adjust_by_size;
		}
	} elseif (packets < 4) {
		/* If we have Tx and Rx ITR maxed and Tx ITR is running in
		 * bulk mode and we are receiving 4 or fewer packets just
		 * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so
		 * that the Rx can relax.
		 */
		if (rc->target_itr == IAVF_ITR_ADAPTIVE_MAX_USECS &&
		    (q_vector->rx.target_itr & IAVF_ITR_MASK) ==
		    IAVF_ITR_ADAPTIVE_MAX_USECS)
			goto clear_counts;
	} elseif (packets > 32) {
		/* If we have processed over 32 packets in a single interrupt
		 * for Tx assume we need to switch over to "bulk" mode.
		 */
		rc->target_itr &= ~IAVF_ITR_ADAPTIVE_LATENCY;
	}

	/* We have no packets to actually measure against. This means
	 * either one of the other queues on this vector is active or
	 * we are a Tx queue doing TSO with too high of an interrupt rate.
	 *
	 * Between 4 and 56 we can assume that our current interrupt delay
	 * is only slightly too low. As such we should increase it by a small
	 * fixed amount.
	 */
	if (packets < 56) {
		itr = rc->target_itr + IAVF_ITR_ADAPTIVE_MIN_INC;
		if ((itr & IAVF_ITR_MASK) > IAVF_ITR_ADAPTIVE_MAX_USECS) {
			itr &= IAVF_ITR_ADAPTIVE_LATENCY;
			itr += IAVF_ITR_ADAPTIVE_MAX_USECS;
		}
		goto clear_counts;
	}

	/* Between 56 and 112 is our "goldilocks" zone where we are
	 * working out "just right". Just report that our current
	 * ITR is good for us.
	 */
	if (packets <= 112)
		goto clear_counts;

	/* If packet count is 128 or greater we are likely looking
	 * at a slight overrun of the delay we want. Try halving
	 * our delay to see if that will cut the number of packets
	 * in half per interrupt.
	 */
	itr /= 2;
	itr &= IAVF_ITR_MASK;
	if (itr < IAVF_ITR_ADAPTIVE_MIN_USECS)
		itr = IAVF_ITR_ADAPTIVE_MIN_USECS;

	goto clear_counts;
	/* NOTE(review): stray closing brace below -- likely closes a
	 * missing "if (packets < 256) {" block.
	 */
	}

	/* The paths below assume we are dealing with a bulk ITR since
	 * number of packets is greater than 256. We are just going to have
	 * to compute a value and try to bring the count under control,
	 * though for smaller packet sizes there isn't much we can do as
	 * NAPI polling will likely be kicking in sooner rather than later.
	 */
	itr = IAVF_ITR_ADAPTIVE_BULK;

adjust_by_size:
	/* If packet counts are 256 or greater we can assume we have a gross
	 * overestimation of what the rate should be. Instead of trying to fine
	 * tune it just use the formula below to try and dial in an exact value
	 * give the current packet size of the frame.
	 */
	avg_wire_size = bytes / packets;

	/* The following is a crude approximation of:
	 *  wmem_default / (size + overhead) = desired_pkts_per_int
	 *  rate / bits_per_byte / (size + ethernet overhead) = pkt_rate
	 *  (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
	 *
	 * Assuming wmem_default is 212992 and overhead is 640 bytes per
	 * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
	 * formula down to
	 *
	 *	(170 * (size + 24)) / (size + 640) = ITR
	 *
	 * We first do some math on the packet size and then finally bitshift
	 * by 8 after rounding up. We also have to account for PCIe link speed
	 * difference as ITR scales based on this.
	 */
	if (avg_wire_size <= 60) {
		/* Start at 250k ints/sec */
		avg_wire_size = 4096;
	} elseif (avg_wire_size <= 380) {
		/* 250K ints/sec to 60K ints/sec */
		avg_wire_size *= 40;
		avg_wire_size += 1696;
	} elseif (avg_wire_size <= 1084) {
		/* 60K ints/sec to 36K ints/sec */
		avg_wire_size *= 15;
		avg_wire_size += 11452;
	} elseif (avg_wire_size <= 1980) {
		/* 36K ints/sec to 30K ints/sec */
		avg_wire_size *= 5;
		avg_wire_size += 22420;
	} else {
		/* plateau at a limit of 30K ints/sec */
		avg_wire_size = 32256;
	}

	/* If we are in low latency mode halve our delay which doubles the
	 * rate to somewhere between 100K to 16K ints/sec
	 */
	if (itr & IAVF_ITR_ADAPTIVE_LATENCY)
		avg_wire_size /= 2;

	/* Resultant value is 256 times larger than it needs to be. This
	 * gives us room to adjust the value as needed to either increase
	 * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc.
	 *
	 * Use addition as we have already recorded the new latency flag
	 * for the ITR value.
	 */
	itr += DIV_ROUND_UP(avg_wire_size,
			    iavf_itr_divisor(q_vector->adapter)) *
	       IAVF_ITR_ADAPTIVE_MIN_INC;

clear_counts:
	/* write back value */
	rc->target_itr = itr;

	/* next update should occur within next jiffy */
	rc->next_update = next_update + 1;

	rc->total_bytes = 0;
	rc->total_packets = 0;
}
/** * iavf_setup_tx_descriptors - Allocate the Tx descriptors * @tx_ring: the tx ring to set up * * Return 0 on success, negative on error
**/ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
{ struct device *dev = tx_ring->dev; int bi_size;
if (!dev) return -ENOMEM;
/* warn if we are about to overwrite the pointer */
WARN_ON(tx_ring->tx_bi);
bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL); if (!tx_ring->tx_bi) goto err;
/* round up to nearest 4K */
tx_ring->size = tx_ring->count * sizeof(struct iavf_tx_desc);
tx_ring->size = ALIGN(tx_ring->size, 4096);
tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
&tx_ring->dma, GFP_KERNEL); if (!tx_ring->desc) {
dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
tx_ring->size); goto err;
}
/**
 * iavf_clean_rx_ring - Free Rx buffers
 * @rx_ring: ring to be cleaned
 *
 * NOTE(review): the body of the buffer-free loop (recycling @rx_fqes,
 * the index increment/wrap) and the ring-state reset that should follow
 * are missing from the visible text -- confirm against the complete
 * source.  "staticvoid"/"conststruct" are mangled keyword pairs.
 **/ staticvoid iavf_clean_rx_ring(struct iavf_ring *rx_ring)
{
	/* ring already cleared, nothing to do */
	if (!rx_ring->rx_fqes)
		return;

	if (rx_ring->skb) {
		dev_kfree_skb(rx_ring->skb);
		rx_ring->skb = NULL;
	}

	/* Free all the Rx ring buffers */
	for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) {
		conststruct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i];
/**
 * iavf_release_rx_desc - Store the new tail and head values
 * @rx_ring: ring to bump
 * @val: new head index
 **/
static void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val)
{
	rx_ring->next_to_use = val;

	/* Force memory writes to complete before letting h/w
	 * know there are new descriptors to fetch. (Only
	 * applicable for weak-ordered memory model archs,
	 * such as IA-64).
	 */
	wmb();
	writel(val, rx_ring->tail);
}
/**
 * iavf_receive_skb - Send a completed packet up the stack
 * @rx_ring: rx ring in play
 * @skb: packet to send up
 * @vlan_tag: vlan tag for packet
 *
 * NOTE(review): only the first declaration of this function is visible;
 * the lines from "rx_desc->qw1 = 0" onward reference rx_desc,
 * cleaned_count and ntu that are declared nowhere here -- they appear to
 * be the tail of a different, truncated function (the Rx buffer
 * allocation loop).  Preserved byte-for-byte (including the mangled
 * "returnfalse"/"returntrue" tokens); confirm against the complete
 * source.
 **/ staticvoid iavf_receive_skb(struct iavf_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag)
{
	struct iavf_q_vector *q_vector = rx_ring->q_vector;

		/* clear the status bits for the next_to_use descriptor */
		rx_desc->qw1 = 0;
		cleaned_count--;
	} while (cleaned_count);

	if (rx_ring->next_to_use != ntu)
		iavf_release_rx_desc(rx_ring, ntu);

	returnfalse;

no_buffers:
	if (rx_ring->next_to_use != ntu)
		iavf_release_rx_desc(rx_ring, ntu);

	rx_ring->rx_stats.alloc_page_failed++;

	/* make sure to come back via polling to try again after
	 * allocation failure
	 */
	returntrue;
}
/**
 * iavf_rx_csum - Indicate in skb if hw indicated a good checksum
 * @vsi: the VSI we care about
 * @skb: skb currently being received and modified
 * @decoded_pt: decoded ptype information
 * @csum_bits: decoded Rx descriptor information
 *
 * NOTE(review): @ipv4 and @ipv6 are read below but never assigned in the
 * visible text -- presumably derived from @decoded_pt in code that was
 * lost; as written this reads uninitialized variables.  Confirm against
 * the complete source.
 **/ staticvoid iavf_rx_csum(conststruct iavf_vsi *vsi, struct sk_buff *skb, struct libeth_rx_pt decoded_pt, struct libeth_rx_csum csum_bits)
{
	bool ipv4, ipv6;

	/* default: let the stack compute/verify the checksum */
	skb->ip_summed = CHECKSUM_NONE;

	/* did the hardware decode the packet and checksum? */
	if (unlikely(!csum_bits.l3l4p))
		return;

	/* IPv4 header error or outer (tunnel) IP header error */
	if (unlikely(ipv4 && (csum_bits.ipe || csum_bits.eipe)))
		goto checksum_fail;

	/* likely incorrect csum if alternate IP extension headers found */
	if (unlikely(ipv6 && csum_bits.ipv6exadd))
		return;

	/* there was some L4 error, count error and punt packet to the stack */
	if (unlikely(csum_bits.l4e))
		goto checksum_fail;

	/* handle packets that were not able to be checksummed due
	 * to arrival speed, in this case the stack can compute
	 * the csum.
	 */
	if (unlikely(csum_bits.pprs))
		return;

	skb->ip_summed = CHECKSUM_UNNECESSARY;
	return;

checksum_fail:
	vsi->back->hw_csum_rx_error++;
}
/**
 * iavf_legacy_rx_csum - Indicate in skb if hw indicated a good checksum
 * @vsi: the VSI we care about
 * @qw1: quad word 1
 * @decoded_pt: decoded packet type
 *
 * This function only operates on the VIRTCHNL_RXDID_1_32B_BASE legacy 32byte
 * descriptor writeback format.
 *
 * Return: decoded checksum bits.
 *
 * NOTE(review): body truncated -- only the has-checksum early return is
 * visible; the extraction of the individual status/error bits from @qw1
 * (and the closing brace) is missing.  "staticstruct"/"conststruct" are
 * mangled keyword pairs.
 **/ staticstruct libeth_rx_csum
iavf_legacy_rx_csum(conststruct iavf_vsi *vsi, u64 qw1, conststruct libeth_rx_pt decoded_pt)
{
	struct libeth_rx_csum csum_bits = {};

	/* nothing to decode if checksum offload isn't active for this ptype */
	if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded_pt))
		return csum_bits;
/**
 * iavf_flex_rx_csum - Indicate in skb if hw indicated a good checksum
 * @vsi: the VSI we care about
 * @qw1: quad word 1
 * @decoded_pt: decoded packet type
 *
 * This function only operates on the VIRTCHNL_RXDID_2_FLEX_SQ_NIC flexible
 * descriptor writeback format.
 *
 * Return: decoded checksum bits.
 *
 * NOTE(review): body truncated -- only the has-checksum early return is
 * visible; the extraction of the checksum bits from @qw1 is missing.
 **/ staticstruct libeth_rx_csum
iavf_flex_rx_csum(conststruct iavf_vsi *vsi, u64 qw1, conststruct libeth_rx_pt decoded_pt)
{
	struct libeth_rx_csum csum_bits = {};

	/* nothing to decode if checksum offload isn't active for this ptype */
	if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded_pt))
		return csum_bits;
/**
 * iavf_legacy_rx_hash - set the hash value in the skb
 * @ring: descriptor ring
 * @qw0: quad word 0
 * @qw1: quad word 1
 * @skb: skb currently being received and modified
 * @decoded_pt: decoded packet type
 *
 * This function only operates on the VIRTCHNL_RXDID_1_32B_BASE legacy 32byte
 * descriptor writeback format.
 *
 * NOTE(review): body truncated -- the FLTSTAT test against @rss_mask and
 * the hash extraction/skb store are missing from the visible text.
 **/ staticvoid iavf_legacy_rx_hash(conststruct iavf_ring *ring, __le64 qw0,
			__le64 qw1, struct sk_buff *skb, conststruct libeth_rx_pt decoded_pt)
{
	const __le64 rss_mask = cpu_to_le64(IAVF_RXD_LEGACY_FLTSTAT_M);
	u32 hash;

	/* bail if the stack doesn't want a hash for this packet type */
	if (!libeth_rx_pt_has_hash(ring->netdev, decoded_pt))
		return;
/**
 * iavf_flex_rx_hash - set the hash value in the skb
 * @ring: descriptor ring
 * @qw1: quad word 1
 * @skb: skb currently being received and modified
 * @decoded_pt: decoded packet type
 *
 * This function only operates on the VIRTCHNL_RXDID_2_FLEX_SQ_NIC flexible
 * descriptor writeback format.
 *
 * NOTE(review): body truncated -- the RSS-valid test and the hash
 * extraction/skb store are missing from the visible text.
 **/ staticvoid iavf_flex_rx_hash(conststruct iavf_ring *ring, __le64 qw1, struct sk_buff *skb, conststruct libeth_rx_pt decoded_pt)
{
	bool rss_valid;
	u32 hash;

	/* bail if the stack doesn't want a hash for this packet type */
	if (!libeth_rx_pt_has_hash(ring->netdev, decoded_pt))
		return;
/**
 * iavf_flex_rx_tstamp - Capture Rx timestamp from the descriptor
 * @rx_ring: descriptor ring
 * @qw2: quad word 2 of descriptor
 * @qw3: quad word 3 of descriptor
 * @skb: skb currently being received
 *
 * Read the Rx timestamp value from the descriptor and pass it to the stack.
 *
 * This function only operates on the VIRTCHNL_RXDID_2_FLEX_SQ_NIC flexible
 * descriptor writeback format.
 *
 * NOTE(review): body truncated -- @ns is declared but the extension of
 * @tstamp into nanoseconds and the store into the skb's hwtstamps are
 * missing from the visible text.
 */ staticvoid iavf_flex_rx_tstamp(conststruct iavf_ring *rx_ring, __le64 qw2,
			__le64 qw3, struct sk_buff *skb)
{
	u32 tstamp;
	u64 ns;

	/* Skip processing if timestamps aren't enabled */
	if (!(rx_ring->flags & IAVF_TXRX_FLAGS_HW_TSTAMP))
		return;

	/* Check if this Rx descriptor has a valid timestamp */
	if (!le64_get_bits(qw2, IAVF_PTP_40B_TSTAMP_VALID))
		return;

	/* the ts_low field only contains the valid bit and sub-nanosecond
	 * precision, so we don't need to extract it.
	 */
	tstamp = le64_get_bits(qw3, IAVF_RXD_FLEX_QW3_TSTAMP_HIGH_M);
/**
 * iavf_process_skb_fields - Populate skb header fields from Rx descriptor
 * @rx_ring: rx descriptor ring packet is being transacted on
 * @rx_desc: pointer to the EOP Rx descriptor
 * @skb: pointer to current skb being populated
 * @ptype: the packet type decoded by hardware
 * @flex: is the descriptor flex or legacy
 *
 * This function checks the ring, descriptor, and packet information in
 * order to populate the hash, checksum, VLAN, protocol, and
 * other fields within the skb.
 *
 * NOTE(review): the legacy/flex dispatch that consumes qw0..qw3,
 * @csum_bits and @decoded_pt (hash/csum/tstamp/VLAN handling) is missing
 * from the visible text; as written those locals are set but unused.
 **/ staticvoid iavf_process_skb_fields(conststruct iavf_ring *rx_ring, conststruct iavf_rx_desc *rx_desc, struct sk_buff *skb, u32 ptype, bool flex)
{
	struct libeth_rx_csum csum_bits;
	struct libeth_rx_pt decoded_pt;
	__le64 qw0 = rx_desc->qw0;
	__le64 qw1 = rx_desc->qw1;
	__le64 qw2 = rx_desc->qw2;
	__le64 qw3 = rx_desc->qw3;

	/* modifies the skb - consumes the enet header */
	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
}
/** * iavf_cleanup_headers - Correct empty headers * @rx_ring: rx descriptor ring packet is being transacted on * @skb: pointer to current skb being fixed * * Also address the case where we are pulling data in on pages only * and as such no data is present in the skb header. * * In addition if skb is not at least 60 bytes we need to pad it so that * it is large enough to qualify as a valid Ethernet frame. * * Returns true if an error was encountered and skb was freed.
**/ staticbool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb)
{ /* if eth_skb_pad returns an error the skb was freed */ if (eth_skb_pad(skb)) returntrue;
returnfalse;
}
/**
 * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff
 * @skb: sk_buff to place the data into
 * @rx_buffer: buffer containing page to add
 * @size: packet length from rx_desc
 *
 * This function will add the data contained in rx_buffer->page to the skb.
 * It will just attach the page as a frag to the skb.
 *
 * The function will then update the page offset.
 *
 * NOTE(review): body truncated -- only the headroom lookup is visible;
 * the call attaching the frag to @skb (and the closing brace) is
 * missing.
 **/ staticvoid iavf_add_rx_frag(struct sk_buff *skb, conststruct libeth_fqe *rx_buffer, unsignedint size)
{
	u32 hr = netmem_get_pp(rx_buffer->netmem)->p.offset;
/** * iavf_build_skb - Build skb around an existing buffer * @rx_buffer: Rx buffer to pull data from * @size: size of buffer to add to skb * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead.
*/ staticstruct sk_buff *iavf_build_skb(conststruct libeth_fqe *rx_buffer, unsignedint size)
{ struct page *buf_page = __netmem_to_page(rx_buffer->netmem);
u32 hr = pp_page_to_nmdesc(buf_page)->pp->p.offset; struct sk_buff *skb; void *va;
/* prefetch first cache line of first page */
va = page_address(buf_page) + rx_buffer->offset;
net_prefetch(va + hr);
/* build an skb around the page buffer */
skb = napi_build_skb(va, rx_buffer->truesize); if (unlikely(!skb)) return NULL;
skb_mark_for_recycle(skb);
/* update pointers within the skb to store the data */
skb_reserve(skb, hr);
__skb_put(skb, size);
return skb;
}
/** * iavf_is_non_eop - process handling of non-EOP buffers * @rx_ring: Rx ring being processed * @fields: Rx descriptor extracted fields * * This function updates next to clean. If the buffer is an EOP buffer * this function exits returning false, otherwise it will place the * sk_buff in the next buffer to be chained and return true indicating * that this is in fact a non-EOP buffer.
**/ staticbool iavf_is_non_eop(struct iavf_ring *rx_ring, struct libeth_rqe_info fields)
{
u32 ntc = rx_ring->next_to_clean + 1;
/* fetch, update, and store next to clean */
ntc = (ntc < rx_ring->count) ? ntc : 0;
rx_ring->next_to_clean = ntc;
prefetch(IAVF_RX_DESC(rx_ring, ntc));
/* if we are the last buffer then there is nothing else to do */ if (likely(fields.eop)) returnfalse;
rx_ring->rx_stats.non_eop_descs++;
returntrue;
}
/**
 * iavf_extract_legacy_rx_fields - Extract fields from the Rx descriptor
 * @rx_ring: rx descriptor ring
 * @rx_desc: the descriptor to process
 *
 * Decode the Rx descriptor and extract relevant information including the
 * size, VLAN tag, Rx packet type, end of packet field and RXE field value.
 *
 * This function only operates on the VIRTCHNL_RXDID_1_32B_BASE legacy 32byte
 * descriptor writeback format.
 *
 * Return: fields extracted from the Rx descriptor.
 *
 * NOTE(review): body truncated -- only the quad-word loads and local
 * declarations are visible; the field extraction into @fields and the
 * return are missing.  @fields is also uninitialized here, unlike the
 * zero-initialized flex variant in this file.
 */ staticstruct libeth_rqe_info
iavf_extract_legacy_rx_fields(conststruct iavf_ring *rx_ring, conststruct iavf_rx_desc *rx_desc)
{
	u64 qw0 = le64_to_cpu(rx_desc->qw0);
	u64 qw1 = le64_to_cpu(rx_desc->qw1);
	u64 qw2 = le64_to_cpu(rx_desc->qw2);
	struct libeth_rqe_info fields;
	bool l2tag1p, l2tag2p;
/**
 * iavf_extract_flex_rx_fields - Extract fields from the Rx descriptor
 * @rx_ring: rx descriptor ring
 * @rx_desc: the descriptor to process
 *
 * Decode the Rx descriptor and extract relevant information including the
 * size, VLAN tag, Rx packet type, end of packet field and RXE field value.
 *
 * This function only operates on the VIRTCHNL_RXDID_2_FLEX_SQ_NIC flexible
 * descriptor writeback format.
 *
 * Return: fields extracted from the Rx descriptor.
 *
 * NOTE(review): body truncated -- only the quad-word loads and local
 * declarations are visible; the field extraction into @fields and the
 * return are missing.
 */ staticstruct libeth_rqe_info
iavf_extract_flex_rx_fields(conststruct iavf_ring *rx_ring, conststruct iavf_rx_desc *rx_desc)
{
	struct libeth_rqe_info fields = {};
	u64 qw0 = le64_to_cpu(rx_desc->qw0);
	u64 qw1 = le64_to_cpu(rx_desc->qw1);
	u64 qw2 = le64_to_cpu(rx_desc->qw2);
	bool l2tag1p, l2tag2p;
/**
 * iavf_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
 * @rx_ring: rx descriptor ring to transact packets on
 * @budget: Total limit on number of packets to process
 *
 * This function provides a "bounce buffer" approach to Rx interrupt
 * processing. The advantage to this is that on systems that have
 * expensive overhead for IOMMU access this provides a means of avoiding
 * it by maintaining the mapping of the page to the system.
 *
 * Returns amount of work completed
 *
 * NOTE(review): heavily truncated in the visible text -- the main
 * processing loop's head, the rx_desc fetch, the declarations of qw1,
 * rx_buffer and fields (all used below), the skb_fields/receive calls
 * after total_rx_bytes, and the final stats writeback/return are all
 * missing; the break/continue statements below have no enclosing loop
 * as shown.  Preserved byte-for-byte; confirm against the complete
 * source.
 **/ staticint iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
{
	bool flex = rx_ring->rxdid == VIRTCHNL_RXDID_2_FLEX_SQ_NIC;
	unsignedint total_rx_bytes = 0, total_rx_packets = 0;
	struct sk_buff *skb = rx_ring->skb;
	u16 cleaned_count = IAVF_DESC_UNUSED(rx_ring);
	bool failure = false;

		/* return some buffers to hardware, one at a time is too slow */
		if (cleaned_count >= IAVF_RX_BUFFER_WRITE) {
			failure = failure ||
				  iavf_alloc_rx_buffers(rx_ring, cleaned_count);
			cleaned_count = 0;
		}

		/* This memory barrier is needed to keep us from reading
		 * any other fields out of the rx_desc until we have
		 * verified the descriptor has been written back.
		 */
		dma_rmb();

		qw1 = le64_to_cpu(rx_desc->qw1);
		/* If DD field (descriptor done) is unset then other fields are
		 * not valid
		 */
		if (!iavf_is_descriptor_done(qw1, flex))
			break;

		rx_buffer = &rx_ring->rx_fqes[rx_ring->next_to_clean];
		if (!libeth_rx_sync_for_cpu(rx_buffer, fields.len))
			goto skip_data;

		/* retrieve a buffer from the ring */
		if (skb)
			iavf_add_rx_frag(skb, rx_buffer, fields.len);
		else
			skb = iavf_build_skb(rx_buffer, fields.len);

		/* exit if we failed to retrieve a buffer */
		if (!skb) {
			rx_ring->rx_stats.alloc_buff_failed++;
			break;
		}

skip_data:
		cleaned_count++;

		if (iavf_is_non_eop(rx_ring, fields) || unlikely(!skb))
			continue;

		/* RXE field in descriptor is an indication of the MAC errors
		 * (like CRC, alignment, oversize etc). If it is set then iavf
		 * should finish.
		 */
		if (unlikely(fields.rxe)) {
			dev_kfree_skb_any(skb);
			skb = NULL;
			continue;
		}

		if (iavf_cleanup_headers(rx_ring, skb)) {
			skb = NULL;
			continue;
		}

		/* probably a little skewed due to removing CRC */
		total_rx_bytes += skb->len;
	/* NOTE(review): the lines below appear to be the tail of an ITR
	 * register-building helper whose declaration and INTENA handling
	 * are missing from the visible text; @itr is not declared here.
	 * The macros that follow are used by iavf_update_enable_itr().
	 */
	/* We don't bother with setting the CLEARPBA bit as the data sheet
	 * points out doing so is "meaningless since it was already
	 * auto-cleared". The auto-clearing happens when the interrupt is
	 * asserted.
	 *
	 * Hardware errata 28 for also indicates that writing to a
	 * xxINT_DYN_CTLx CSR with INTENA_MSK (bit 31) set to 0 will clear
	 * an event in the PBA anyway so we need to rely on the automask
	 * to hold pending events for us until the interrupt is re-enabled
	 *
	 * The itr value is reported in microseconds, and the register
	 * value is recorded in 2 microsecond units. For this reason we
	 * only need to shift by the interval shift - 1 instead of the
	 * full value.
	 */
	itr &= IAVF_ITR_MASK;

/* a small macro to shorten up some long lines */
#define INTREG IAVF_VFINT_DYN_CTLN1

/* The act of updating the ITR will cause it to immediately trigger. In order
 * to prevent this from throwing off adaptive update statistics we defer the
 * update so that it can only happen so often. So after either Tx or Rx are
 * updated we make the adaptive scheme wait until either the ITR completely
 * expires via the next_update expiration or we have been through at least
 * 3 interrupts.
 */
#define ITR_COUNTDOWN_START 3
/** * iavf_update_enable_itr - Update itr and re-enable MSIX interrupt * @vsi: the VSI we care about * @q_vector: q_vector for which itr is being updated and interrupt enabled *
**/ staticvoid iavf_update_enable_itr(struct iavf_vsi *vsi, struct iavf_q_vector *q_vector)
{ struct iavf_hw *hw = &vsi->back->hw;
u32 intval;
/* These will do nothing if dynamic updates are not enabled */
iavf_update_itr(q_vector, &q_vector->tx);
iavf_update_itr(q_vector, &q_vector->rx);
/* This block of logic allows us to get away with only updating * one ITR value with each interrupt. The idea is to perform a * pseudo-lazy update with the following criteria. * * 1. Rx is given higher priority than Tx if both are in same state * 2. If we must reduce an ITR that is given highest priority. * 3. We then give priority to increasing ITR based on amount.
*/ if (q_vector->rx.target_itr < q_vector->rx.current_itr) { /* Rx ITR needs to be reduced, this is highest priority */
intval = iavf_buildreg_itr(IAVF_RX_ITR,
q_vector->rx.target_itr);
q_vector->rx.current_itr = q_vector->rx.target_itr;
q_vector->itr_countdown = ITR_COUNTDOWN_START;
} elseif ((q_vector->tx.target_itr < q_vector->tx.current_itr) ||
((q_vector->rx.target_itr - q_vector->rx.current_itr) <
(q_vector->tx.target_itr - q_vector->tx.current_itr))) { /* Tx ITR needs to be reduced, this is second priority * Tx ITR needs to be increased more than Rx, fourth priority
*/
intval = iavf_buildreg_itr(IAVF_TX_ITR,
q_vector->tx.target_itr);
q_vector->tx.current_itr = q_vector->tx.target_itr;
q_vector->itr_countdown = ITR_COUNTDOWN_START;
} elseif (q_vector->rx.current_itr != q_vector->rx.target_itr) { /* Rx ITR needs to be increased, third priority */
intval = iavf_buildreg_itr(IAVF_RX_ITR,
q_vector->rx.target_itr);
q_vector->rx.current_itr = q_vector->rx.target_itr;
q_vector->itr_countdown = ITR_COUNTDOWN_START;
} else { /* No ITR update, lowest priority */
intval = iavf_buildreg_itr(IAVF_ITR_NONE, 0); if (q_vector->itr_countdown)
q_vector->itr_countdown--;
}
if (!test_bit(__IAVF_VSI_DOWN, vsi->state))
wr32(hw, INTREG(q_vector->reg_idx), intval);
}
/**
 * iavf_napi_poll - NAPI polling Rx/Tx cleanup routine
 * @napi: napi struct with our devices info in it
 * @budget: amount of work driver is allowed to do this pass, in packets
 *
 * This function will clean all queues associated with a q_vector.
 *
 * Returns the amount of work done
 **/
int iavf_napi_poll(struct napi_struct *napi, int budget)
{
	struct iavf_q_vector *q_vector =
			       container_of(napi, struct iavf_q_vector, napi);
	struct iavf_vsi *vsi = q_vector->vsi;
	struct iavf_ring *ring;
	bool clean_complete = true;
	bool arm_wb = false;
	int budget_per_ring;
	int work_done = 0;

	if (test_bit(__IAVF_VSI_DOWN, vsi->state)) {
		napi_complete(napi);
		return 0;
	}

	/* Since the actual Tx work is minimal, we can give the Tx a larger
	 * budget and be more aggressive about cleaning up the Tx descriptors.
	 */
	iavf_for_each_ring(ring, q_vector->tx) {
		if (!iavf_clean_tx_irq(vsi, ring, budget)) {
			clean_complete = false;
			continue;
		}
		arm_wb |= !!(ring->flags & IAVF_TXR_FLAGS_ARM_WB);
		ring->flags &= ~IAVF_TXR_FLAGS_ARM_WB;
	}

	/* Handle case where we are called by netpoll with a budget of 0 */
	if (budget <= 0)
		goto tx_only;

	/* We attempt to distribute budget to each Rx queue fairly, but don't
	 * allow the budget to go below 1 because that would exit polling early.
	 */
	budget_per_ring = max(budget / q_vector->num_ringpairs, 1);

	iavf_for_each_ring(ring, q_vector->rx) {
		int cleaned = iavf_clean_rx_irq(ring, budget_per_ring);

		work_done += cleaned;
		/* if we clean as many as budgeted, we must not be done */
		if (cleaned >= budget_per_ring)
			clean_complete = false;
	}

	/* If work not completed, return budget and polling will return */
	if (!clean_complete) {
		int cpu_id = smp_processor_id();

		/* It is possible that the interrupt affinity has changed but,
		 * if the cpu is pegged at 100%, polling will never exit while
		 * traffic continues and the interrupt will be stuck on this
		 * cpu. We check to make sure affinity is correct before we
		 * continue to poll, otherwise we must stop polling so the
		 * interrupt can move to the correct cpu.
		 */
		if (!cpumask_test_cpu(cpu_id,
				      &q_vector->napi.config->affinity_mask)) {
			/* Tell napi that we are done polling */
			napi_complete_done(napi, work_done);

			/* Force an interrupt */
			iavf_force_wb(vsi, q_vector);

			/* Return budget-1 so that polling stops */
			return budget - 1;
		}
tx_only:
		if (arm_wb) {
			q_vector->tx.ring[0].tx_stats.tx_force_wb++;
			iavf_enable_wb_on_itr(vsi, q_vector);
		}
		return budget;
	}

	/* IAVF_TXR_FLAGS_WB_ON_ITR is a per-ring flag bit (see the
	 * IAVF_TXR_FLAGS_ARM_WB handling above); it must be tested on the
	 * Tx ring's flags, not on the adapter flags word in vsi->back->flags,
	 * which holds unrelated IAVF_FLAG_* bits.
	 */
	if (q_vector->tx.ring[0].flags & IAVF_TXR_FLAGS_WB_ON_ITR)
		q_vector->arm_wb_state = false;

	/* Exit the polling mode, but don't re-enable interrupts if stack might
	 * poll us due to busy-polling
	 */
	if (likely(napi_complete_done(napi, work_done)))
		iavf_update_enable_itr(vsi, q_vector);

	return min_t(int, work_done, budget - 1);
}
/**
 * iavf_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
 * @skb: send buffer
 * @tx_ring: ring to send buffer on
 * @flags: the tx flags to be set
 *
 * Checks the skb and sets up the generic transmit flags related to VLAN
 * tagging for the HW: records the VLAN id and requests hardware tag
 * insertion. Leaves @flags untouched when no VLAN tag is present.
 **/
static void iavf_tx_prepare_vlan_flags(struct sk_buff *skb,
				       struct iavf_ring *tx_ring, u32 *flags)
{
	u32 tx_flags = 0;

	/* stack will only request hardware VLAN insertion offload for protocols
	 * that the driver supports and has enabled
	 */
	if (!skb_vlan_tag_present(skb))
		return;

	/* NOTE(review): the original body here was a mis-spliced fragment of
	 * the checksum-offload path (it referenced undeclared tunnel/ip/l4
	 * state and never wrote *flags). Reconstructed minimal VLAN handling:
	 * stash the tag in the upper bits and request HW insertion. If this
	 * driver version also supports L2TAG2 (outer tag) placement via a
	 * tx_ring flag, that branch needs to be re-added — confirm against
	 * the ring flag definitions in the header.
	 */
	tx_flags |= skb_vlan_tag_get(skb) << IAVF_TX_FLAGS_VLAN_SHIFT;
	tx_flags |= IAVF_TX_FLAGS_HW_VLAN;

	*flags = tx_flags;
}
/** * __iavf_chk_linearize - Check if there are more than 8 buffers per packet * @skb: send buffer * * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire * and so we need to figure out the cases where we need to linearize the skb. * * For TSO we need to count the TSO header and segment payload separately. * As such we need to check cases where we have 7 fragments or more as we * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for * the segment payload in the first descriptor, and another 7 for the * fragments.
**/ bool __iavf_chk_linearize(struct sk_buff *skb)
{ const skb_frag_t *frag, *stale; int nr_frags, sum;
/* no need to check if number of frags is less than 7 */
nr_frags = skb_shinfo(skb)->nr_frags; if (nr_frags < (IAVF_MAX_BUFFER_TXD - 1)) returnfalse;
/* We need to walk through the list and validate that each group * of 6 fragments totals at least gso_size.
*/
nr_frags -= IAVF_MAX_BUFFER_TXD - 2;
frag = &skb_shinfo(skb)->frags[0];
/* Initialize size to the negative value of gso_size minus 1. We * use this as the worst case scenerio in which the frag ahead * of us only provides one byte which is why we are limited to 6 * descriptors for a single transmit as the header and previous * fragment are already consuming 2 descriptors.
*/
sum = 1 - skb_shinfo(skb)->gso_size;
/* Add size of frags 0 through 4 to create our initial sum */
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
/* Walk through fragments adding latest fragment, testing it, and * then removing stale fragments from the sum.
*/ for (stale = &skb_shinfo(skb)->frags[0];; stale++) { int stale_size = skb_frag_size(stale);
sum += skb_frag_size(frag++);
/* The stale fragment may present us with a smaller * descriptor than the actual fragment size. To account * for that we need to remove all the data on the front and * figure out what the remainder would be in the last * descriptor associated with the fragment.
*/ if (stale_size > IAVF_MAX_DATA_PER_TXD) { int align_pad = -(skb_frag_off(stale)) &
(IAVF_MAX_READ_REQ_SIZE - 1);
sum -= align_pad;
stale_size -= align_pad;
do {
sum -= IAVF_MAX_DATA_PER_TXD_ALIGNED;
stale_size -= IAVF_MAX_DATA_PER_TXD_ALIGNED;
} while (stale_size > IAVF_MAX_DATA_PER_TXD);
}
/* if sum is negative we failed to make sufficient progress */ if (sum < 0) returntrue;
if (!nr_frags--) break;
sum -= stale_size;
}
returnfalse;
}
/**
 * __iavf_maybe_stop_tx - 2nd level check for tx stop conditions
 * @tx_ring: the ring to be checked
 * @size: the size buffer we want to assure is available
 *
 * Returns -EBUSY if a stop is needed, else 0
 **/
int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size)
{
	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);

	/* Memory barrier before checking head and tail */
	smp_mb();

	/* Check again in case another CPU has just made room available. */
	if (unlikely(IAVF_DESC_UNUSED(tx_ring) >= size)) {
		/* A reprieve! - use start_queue because it doesn't call
		 * schedule
		 */
		netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
		++tx_ring->tx_stats.restart_queue;
		return 0;
	}

	return -EBUSY;
}
/** * iavf_tx_map - Build the Tx descriptor * @tx_ring: ring to send buffer on * @skb: send buffer * @first: first buffer info buffer to use * @tx_flags: collected send information * @hdr_len: size of the packet header * @td_cmd: the command field in the descriptor * @td_offset: offset for checksum or crc
**/ staticvoid iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, struct iavf_tx_buffer *first, u32 tx_flags, const u8 hdr_len, u32 td_cmd, u32 td_offset)
{ unsignedint data_len = skb->data_len; unsignedint size = skb_headlen(skb);
skb_frag_t *frag; struct iavf_tx_buffer *tx_bi; struct iavf_tx_desc *tx_desc;
u16 i = tx_ring->next_to_use;
u32 td_tag = 0;
dma_addr_t dma;
/* write last descriptor with RS and EOP bits */
td_cmd |= IAVF_TXD_CMD;
tx_desc->cmd_type_offset_bsz =
build_ctob(td_cmd, td_offset, size, td_tag);
skb_tx_timestamp(skb);
/* Force memory writes to complete before letting h/w know there * are new descriptors to fetch. * * We also use this memory barrier to make certain all of the * status bits have been updated before next_to_watch is written.
*/
wmb();
/* set next_to_watch value indicating a packet is present */
first->next_to_watch = tx_desc;
/* notify HW of packet */ if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) {
writel(i, tx_ring->tail);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.