/* NOTE: Packets bigger than 1530 are split across multiple pages and XDP needs
 * the buffer to be contiguous. Allow XDP to be set up only if we don't exceed
 * this value, keeping headroom for the 14 byte Ethernet header and two
 * VLAN tags (for QinQ)
 */
#define MAX_XDP_MTU	(1530 - ETH_HLEN - VLAN_HLEN * 2)
/* The Cavium ThunderX network controller can *only* be found in SoCs
 * containing the ThunderX ARM64 CPU implementation.  All accesses to the
 * device registers on this platform are implicitly strongly ordered with
 * respect to memory accesses.  So writeq_relaxed() and readq_relaxed() are
 * safe to use with no memory barriers in this driver.  The readq()/writeq()
 * functions add explicit ordering operation which in this case are
 * redundant, and only add overhead.
 */
/* Send a mailbox message to the PF and wait for its response.
 *
 * Serialized by nic->rx_mode_mtx so only one mailbox transaction is in
 * flight at a time.  The PF's interrupt handler sets pf_acked/pf_nacked.
 *
 * Returns 0 on ACK, -EINVAL if the PF NACKs the message, or -EBUSY if
 * no response arrives within NIC_MBOX_MSG_TIMEOUT.
 */
int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx)
{
	unsigned long timeout;
	int ret = 0;

	mutex_lock(&nic->rx_mode_mtx);

	nic->pf_acked = false;
	nic->pf_nacked = false;

	nicvf_write_to_mbx(nic, mbx);

	timeout = jiffies + msecs_to_jiffies(NIC_MBOX_MSG_TIMEOUT);

	/* Wait for previous message to be acked, timeout 2sec */
	while (!nic->pf_acked) {
		if (nic->pf_nacked) {
			netdev_err(nic->netdev,
				   "PF NACK to mbox msg 0x%02x from VF%d\n",
				   (mbx->msg.msg & 0xFF), nic->vf_id);
			ret = -EINVAL;
			break;
		}
		/* Poll roughly every 8-10ms until ACK, NACK or timeout */
		usleep_range(8000, 10000);
		if (nic->pf_acked)
			break;
		if (time_after(jiffies, timeout)) {
			netdev_err(nic->netdev,
				   "PF didn't ACK to mbox msg 0x%02x from VF%d\n",
				   (mbx->msg.msg & 0xFF), nic->vf_id);
			ret = -EBUSY;
			break;
		}
	}
	mutex_unlock(&nic->rx_mode_mtx);
	return ret;
}
/* Checks if VF is able to communicate with PF
 * and also gets the VNIC number this VF is associated to.
 *
 * Returns 1 when the PF acknowledged the READY message, 0 otherwise.
 */
static int nicvf_check_pf_ready(struct nicvf *nic)
{
	union nic_mbx mbx = {};

	mbx.msg.msg = NIC_MBOX_MSG_READY;
	if (nicvf_send_msg_to_pf(nic, &mbx)) {
		netdev_err(nic->netdev,
			   "PF didn't respond to READY msg\n");
		return 0;
	}

	return 1;
}
/* Notify the PF that this VF has finished its configuration; a failure
 * to get an ACK is only logged since there is nothing to roll back here.
 */
static void nicvf_send_cfg_done(struct nicvf *nic)
{
	union nic_mbx mbx = {};

	mbx.msg.msg = NIC_MBOX_MSG_CFG_DONE;
	if (nicvf_send_msg_to_pf(nic, &mbx)) {
		netdev_err(nic->netdev,
			   "PF didn't respond to CFG DONE msg\n");
	}
}
/* Set no of Rx/Tx queues in each of the SQsets */ for (sqs = 0; sqs < nic->sqs_count; sqs++) {
mbx.nicvf.msg = NIC_MBOX_MSG_SNICVF_PTR;
mbx.nicvf.vf_id = nic->vf_id;
mbx.nicvf.sqs_id = sqs;
nicvf_send_msg_to_pf(nic, &mbx);
/* Initialize secondary Qset's queues and its interrupts */
nicvf_open(nic->snicvf[sqs]->netdev);
}
/* Update stack with actual Rx/Tx queue count allocated */ if (sqs_count != nic->sqs_count)
nicvf_set_real_num_queues(nic->netdev,
nic->tx_queues, nic->rx_queues);
}
/* Send this Qset's nicvf pointer to PF. * PF inturn sends primary VF's nicvf struct to secondary Qsets/VFs * so that packets received by these Qsets can use primary VF's netdev
*/ staticvoid nicvf_send_vf_struct(struct nicvf *nic)
{ union nic_mbx mbx = {};
/* Update the network stack with the number of Tx/Rx queues actually
 * allocated for this interface.
 *
 * Returns 0 on success or the error from the first failing
 * netif_set_real_num_*_queues() call; a Tx failure skips the Rx update.
 */
int nicvf_set_real_num_queues(struct net_device *netdev,
			      int tx_queues, int rx_queues)
{
	int err = 0;

	err = netif_set_real_num_tx_queues(netdev, tx_queues);
	if (err) {
		netdev_err(netdev,
			   "Failed to set no of Tx queues: %d\n", tx_queues);
		return err;
	}

	err = netif_set_real_num_rx_queues(netdev, rx_queues);
	if (err)
		netdev_err(netdev,
			   "Failed to set no of Rx queues: %d\n", rx_queues);
	return err;
}
staticint nicvf_init_resources(struct nicvf *nic)
{ int err;
/* Enable Qset */
nicvf_qset_config(nic, true);
/* Initialize queues and HW for data transfer */
err = nicvf_config_data_transfer(nic, true); if (err) {
netdev_err(nic->netdev, "Failed to alloc/config VF's QSet resources\n"); return err;
}
len = xdp.data_end - xdp.data; /* Check if XDP program has changed headers */ if (orig_data != xdp.data) {
offset = orig_data - xdp.data;
dma_addr -= offset;
}
switch (action) { case XDP_PASS: /* Check if it's a recycled page, if not * unmap the DMA mapping. * * Recycled page holds an extra reference.
*/ if (page_ref_count(page) == 1) {
dma_addr &= PAGE_MASK;
dma_unmap_page_attrs(&nic->pdev->dev, dma_addr,
RCV_FRAG_LEN + XDP_PACKET_HEADROOM,
DMA_FROM_DEVICE,
DMA_ATTR_SKIP_CPU_SYNC);
}
/* Build SKB and pass on packet to network stack */
*skb = build_skb(xdp.data,
RCV_FRAG_LEN - cqe_rx->align_pad + offset); if (!*skb)
put_page(page); else
skb_put(*skb, len); returnfalse; case XDP_TX:
nicvf_xdp_sq_append_pkt(nic, sq, (u64)xdp.data, dma_addr, len); returntrue; default:
bpf_warn_invalid_xdp_action(nic->netdev, prog, action);
fallthrough; case XDP_ABORTED:
trace_xdp_exception(nic->netdev, prog, action);
fallthrough; case XDP_DROP: /* Check if it's a recycled page, if not * unmap the DMA mapping. * * Recycled page holds an extra reference.
*/ if (page_ref_count(page) == 1) {
dma_addr &= PAGE_MASK;
dma_unmap_page_attrs(&nic->pdev->dev, dma_addr,
RCV_FRAG_LEN + XDP_PACKET_HEADROOM,
DMA_FROM_DEVICE,
DMA_ATTR_SKIP_CPU_SYNC);
}
put_page(page); returntrue;
} returnfalse;
}
/* New timestamp request can be queued now */
atomic_set(&nic->tx_ptp_skbs, 0);
/* Check for timestamp requested skb */ if (!nic->ptp_skb) return;
/* Check if timestamping is timedout, which is set to 10us */ if (cqe_tx->send_status == CQ_TX_ERROP_TSTMP_TIMEOUT ||
cqe_tx->send_status == CQ_TX_ERROP_TSTMP_CONFLICT) goto no_tstamp;
/* Get the timestamp */
memset(&ts, 0, sizeof(ts));
ns = cavium_ptp_tstamp2time(nic->ptp_clock, cqe_tx->ptp_timestamp);
ts.hwtstamp = ns_to_ktime(ns);
skb_tstamp_tx(nic->ptp_skb, &ts);
no_tstamp: /* Free the original skb */
dev_kfree_skb_any(nic->ptp_skb);
nic->ptp_skb = NULL; /* Sync 'ptp_skb' */
smp_wmb();
}
/* Check for errors */ if (cqe_tx->send_status)
nicvf_check_cqe_tx_errs(nic->pnicvf, cqe_tx);
/* Is this a XDP designated Tx queue */ if (sq->is_xdp) {
page = (struct page *)sq->xdp_page[cqe_tx->sqe_ptr]; /* Check if it's recycled page or else unmap DMA mapping */ if (page && (page_ref_count(page) == 1))
nicvf_unmap_sndq_buffers(nic, sq, cqe_tx->sqe_ptr,
hdr->subdesc_cnt);
/* Release page reference for recycling */ if (page)
put_page(page);
sq->xdp_page[cqe_tx->sqe_ptr] = (u64)NULL;
*subdesc_cnt += hdr->subdesc_cnt + 1; return;
}
skb = (struct sk_buff *)sq->skbuff[cqe_tx->sqe_ptr]; if (skb) { /* Check for dummy descriptor used for HW TSO offload on 88xx */ if (hdr->dont_send) { /* Get actual TSO descriptors and free them */
tso_sqe =
(struct sq_hdr_subdesc *)GET_SQ_DESC(sq, hdr->rsvd2);
nicvf_unmap_sndq_buffers(nic, sq, hdr->rsvd2,
tso_sqe->subdesc_cnt);
*subdesc_cnt += tso_sqe->subdesc_cnt + 1;
} else {
nicvf_unmap_sndq_buffers(nic, sq, cqe_tx->sqe_ptr,
hdr->subdesc_cnt);
}
*subdesc_cnt += hdr->subdesc_cnt + 1;
prefetch(skb);
(*tx_pkts)++;
*tx_bytes += skb->len; /* If timestamp is requested for this skb, don't free it */ if (skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS &&
!nic->pnicvf->ptp_skb)
nic->pnicvf->ptp_skb = skb; else
napi_consume_skb(skb, budget);
sq->skbuff[cqe_tx->sqe_ptr] = (u64)NULL;
} else { /* In case of SW TSO on 88xx, only last segment will have * a SKB attached, so just free SQEs here.
*/ if (!nic->hw_tso)
*subdesc_cnt += hdr->subdesc_cnt + 1;
}
}
if (!nic->ptp_clock || !nic->hw_rx_tstamp) return;
/* The first 8 bytes is the timestamp */
ns = cavium_ptp_tstamp2time(nic->ptp_clock,
be64_to_cpu(*(__be64 *)skb->data));
skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(ns);
spin_lock_bh(&cq->lock);
loop:
processed_cqe = 0; /* Get no of valid CQ entries to process */
cqe_count = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_STATUS, cq_idx);
cqe_count &= CQ_CQE_COUNT; if (!cqe_count) goto done;
/* Get head of the valid CQ entries */
cqe_head = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_HEAD, cq_idx) >> 9;
cqe_head &= 0xFFFF;
while (processed_cqe < cqe_count) { /* Get the CQ descriptor */
cq_desc = (struct cqe_rx_t *)GET_CQ_DESC(cq, cqe_head);
cqe_head++;
cqe_head &= (cq->dmem.q_len - 1); /* Initiate prefetch for next descriptor */
prefetch((struct cqe_rx_t *)GET_CQ_DESC(cq, cqe_head));
if ((work_done >= budget) && napi &&
(cq_desc->cqe_type != CQE_TYPE_SEND)) { break;
}
switch (cq_desc->cqe_type) { case CQE_TYPE_RX:
nicvf_rcv_pkt_handler(netdev, napi, cq_desc, sq, rq);
work_done++; break; case CQE_TYPE_SEND:
nicvf_snd_pkt_handler(netdev, (void *)cq_desc,
budget, &subdesc_cnt,
&tx_pkts, &tx_bytes);
tx_done++; break; case CQE_TYPE_SEND_PTP:
nicvf_snd_ptp_handler(netdev, (void *)cq_desc); break; case CQE_TYPE_INVALID: case CQE_TYPE_RX_SPLIT: case CQE_TYPE_RX_TCP: /* Ignore for now */ break;
}
processed_cqe++;
}
/* Ring doorbell to inform H/W to reuse processed CQEs */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_DOOR,
cq_idx, processed_cqe);
/* Wakeup TXQ if its stopped earlier due to SQ full */ if (tx_done ||
(atomic_read(&sq->free_cnt) >= MIN_SQ_DESC_PER_PKT_XMIT)) {
netdev = nic->pnicvf->netdev;
txq = netdev_get_tx_queue(netdev, txq_idx); if (tx_pkts)
netdev_tx_completed_queue(txq, tx_pkts, tx_bytes);
/* To read updated queue and carrier status */
smp_mb(); if (netif_tx_queue_stopped(txq) && netif_carrier_ok(netdev)) {
netif_tx_wake_queue(txq);
nic = nic->pnicvf;
this_cpu_inc(nic->drv_stats->txq_wake);
netif_warn(nic, tx_err, netdev, "Transmit queue wakeup SQ%d\n", txq_idx);
}
}
staticvoid nicvf_set_irq_affinity(struct nicvf *nic)
{ int vec, cpu;
for (vec = 0; vec < nic->num_vec; vec++) { if (!nic->irq_allocated[vec]) continue;
if (!zalloc_cpumask_var(&nic->affinity_mask[vec], GFP_KERNEL)) return; /* CQ interrupts */ if (vec < NICVF_INTR_ID_SQ) /* Leave CPU0 for RBDR and other interrupts */
cpu = nicvf_netdev_qidx(nic, vec) + 1; else
cpu = 0;
/* Initialize MSIX vectors and register MISC interrupt. * Send READY message to PF to check if its alive
*/ staticint nicvf_register_misc_interrupt(struct nicvf *nic)
{ int ret = 0; int irq = NICVF_INTR_ID_MISC;
/* Return if mailbox interrupt is already registered */ if (nic->pdev->msix_enabled) return 0;
/* Enable MSI-X */
nic->num_vec = pci_msix_vec_count(nic->pdev);
ret = pci_alloc_irq_vectors(nic->pdev, nic->num_vec, nic->num_vec,
PCI_IRQ_MSIX); if (ret < 0) {
netdev_err(nic->netdev, "Req for #%d msix vectors failed\n", nic->num_vec); return ret;
}
/* Check if VF is able to communicate with PF */ if (!nicvf_check_pf_ready(nic)) {
nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0);
nicvf_unregister_interrupts(nic); return -EIO;
}
/* Check for minimum packet length */ if (skb->len <= ETH_HLEN) {
dev_kfree_skb(skb); return NETDEV_TX_OK;
}
/* In XDP case, initial HW tx queues are used for XDP, * but stack's queue mapping starts at '0', so skip the * Tx queues attached to Rx queues for XDP.
*/ if (nic->xdp_prog)
qid += nic->xdp_tx_queues;
int nicvf_stop(struct net_device *netdev)
{ int irq, qidx; struct nicvf *nic = netdev_priv(netdev); struct queue_set *qs = nic->qs; struct nicvf_cq_poll *cq_poll = NULL; union nic_mbx mbx = {};
/* wait till all queued set_rx_mode tasks completes */ if (nic->nicvf_rx_mode_wq) {
cancel_delayed_work_sync(&nic->link_change_work);
drain_workqueue(nic->nicvf_rx_mode_wq);
}
/* Check if we got MAC address from PF or else generate a radom MAC */ if (!nic->sqs_mode && is_zero_ether_addr(netdev->dev_addr)) {
eth_hw_addr_random(netdev);
nicvf_hw_set_mac_addr(nic, netdev);
}
if (nic->set_mac_pending) {
nic->set_mac_pending = false;
nicvf_hw_set_mac_addr(nic, netdev);
}
/* Configure receive side scaling and MTU */ if (!nic->sqs_mode) {
nicvf_rss_init(nic);
err = nicvf_update_hw_max_frs(nic, netdev->mtu); if (err) goto cleanup;
/* For now just support only the usual MTU sized frames, * plus some headroom for VLAN, QinQ.
*/ if (nic->xdp_prog && new_mtu > MAX_XDP_MTU) {
netdev_warn(netdev, "Jumbo frames not yet supported with XDP, current MTU %d.\n",
netdev->mtu); return -EINVAL;
}
if (netif_running(netdev) && nicvf_update_hw_max_frs(nic, new_mtu)) return -EINVAL;
/* For now just support only the usual MTU sized frames, * plus some headroom for VLAN, QinQ.
*/ if (prog && dev->mtu > MAX_XDP_MTU) {
netdev_warn(dev, "Jumbo frames not yet supported with XDP, current MTU %d.\n",
dev->mtu); return -EOPNOTSUPP;
}
/* ALL SQs attached to CQs i.e same as RQs, are treated as * XDP Tx queues and more Tx queues are allocated for * network stack to send pkts out. * * No of Tx queues are either same as Rx queues or whatever * is left in max no of queues possible.
*/ if ((nic->rx_queues + nic->tx_queues) > nic->max_queues) {
netdev_warn(dev, "Failed to attach BPF prog, RXQs + TXQs > Max %d\n",
nic->max_queues); return -ENOMEM;
}
if (if_up)
nicvf_stop(nic->netdev);
old_prog = xchg(&nic->xdp_prog, prog); /* Detach old prog, if any */ if (old_prog)
bpf_prog_put(old_prog);
if (nic->xdp_prog) { /* Attach BPF program */
bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1);
bpf_attached = true;
}
/* Calculate Tx queues needed for XDP and network stack */
nicvf_set_xdp_queues(nic, bpf_attached);
/* To avoid checks while retrieving buffer address from CQE_RX, * do not support XDP for T88 pass1.x silicons which are anyway * not in use widely.
*/ if (pass1_silicon(nic->pdev)) return -EOPNOTSUPP;
if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) return -EFAULT;
switch (config.tx_type) { case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: break; default: return -ERANGE;
}
switch (config.rx_filter) { case HWTSTAMP_FILTER_NONE:
nic->hw_rx_tstamp = false; break; case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
nic->hw_rx_tstamp = true;
config.rx_filter = HWTSTAMP_FILTER_ALL; break; default: return -ERANGE;
}
if (netif_running(netdev))
nicvf_config_hw_rx_tstamp(nic, nic->hw_rx_tstamp);
if (copy_to_user(ifr->ifr_data, &config, sizeof(config))) return -EFAULT;
staticvoid __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, struct nicvf *nic)
{ union nic_mbx mbx = {}; int idx;
/* From the inside of VM code flow we have only 128 bits memory * available to send message to host's PF, so send all mc addrs * one by one, starting from flush command in case if kernel * requests to configure specific MAC filtering
*/
/* flush DMAC filters and reset RX mode */
mbx.xcast.msg = NIC_MBOX_MSG_RESET_XCAST; if (nicvf_send_msg_to_pf(nic, &mbx) < 0) goto free_mc;
if (mode & BGX_XCAST_MCAST_FILTER) { /* once enabling filtering, we need to signal to PF to add * its' own LMAC to the filter to accept packets for it.
*/
mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
mbx.xcast.mac = 0; if (nicvf_send_msg_to_pf(nic, &mbx) < 0) goto free_mc;
}
/* check if we have any specific MACs to be added to PF DMAC filter */ if (mc_addrs) { /* now go through kernel list of MACs and add them one by one */ for (idx = 0; idx < mc_addrs->count; idx++) {
mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
mbx.xcast.mac = mc_addrs->mc[idx]; if (nicvf_send_msg_to_pf(nic, &mbx) < 0) goto free_mc;
}
}
/* and finally set rx mode for PF accordingly */
mbx.xcast.msg = NIC_MBOX_MSG_SET_XCAST;
mbx.xcast.mode = mode;
/* Save message data locally to prevent them from * being overwritten by next ndo_set_rx_mode call().
*/
spin_lock_bh(&nic->rx_mode_wq_lock);
mode = vf_work->mode;
mc = vf_work->mc;
vf_work->mc = NULL;
spin_unlock_bh(&nic->rx_mode_wq_lock);
ptp_clock = cavium_ptp_get(); if (IS_ERR(ptp_clock)) { if (PTR_ERR(ptp_clock) == -ENODEV) /* In virtualized environment we proceed without ptp */
ptp_clock = NULL; else return PTR_ERR(ptp_clock);
}
err = pci_enable_device(pdev); if (err) return dev_err_probe(dev, err, "Failed to enable PCI device\n");
err = pci_request_regions(pdev, DRV_NAME); if (err) {
dev_err(dev, "PCI request regions failed 0x%x\n", err); goto err_disable_device;
}
err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48)); if (err) {
dev_err(dev, "Unable to get usable DMA configuration\n"); goto err_release_regions;
}
qcount = netif_get_num_default_rss_queues();
/* Restrict multiqset support only for host bound VFs */ if (pdev->is_virtfn) { /* Set max number of queues per VF */
qcount = min_t(int, num_online_cpus(),
(MAX_SQS_PER_VF + 1) * MAX_CMP_QUEUES_PER_QS);
}
nic = netdev_priv(netdev);
nic->netdev = netdev;
nic->pdev = pdev;
nic->pnicvf = nic;
nic->max_queues = qcount; /* If no of CPUs are too low, there won't be any queues left * for XDP_TX, hence double it.
*/ if (!nic->t88)
nic->max_queues *= 2;
nic->ptp_clock = ptp_clock;
/* Initialize mutex that serializes usage of VF's mailbox */
mutex_init(&nic->rx_mode_mtx);
nic = netdev_priv(netdev);
pnetdev = nic->pnicvf->netdev;
/* Check if this Qset is assigned to different VF. * If yes, clean primary and all secondary Qsets.
*/ if (pnetdev && (pnetdev->reg_state == NETREG_REGISTERED))
unregister_netdev(pnetdev); if (nic->nicvf_rx_mode_wq) {
destroy_workqueue(nic->nicvf_rx_mode_wq);
nic->nicvf_rx_mode_wq = NULL;
}
nicvf_unregister_interrupts(nic);
pci_set_drvdata(pdev, NULL); if (nic->drv_stats)
free_percpu(nic->drv_stats);
cavium_ptp_put(nic->ptp_clock);
free_netdev(netdev);
pci_release_regions(pdev);
pci_disable_device(pdev);
}
/* NOTE(review): the text below is web-page disclaimer boilerplate that was
 * accidentally appended to this source file during extraction; it is not
 * part of the driver.  Commented out so the file still compiles; original
 * wording preserved for reference.
 *
 * "Die Informationen auf dieser Webseite wurden nach bestem Wissen
 *  sorgfaeltig zusammengestellt. Es wird jedoch weder Vollstaendigkeit,
 *  noch Richtigkeit, noch Qualitaet der bereit gestellten Informationen
 *  zugesichert.
 *  Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 *  experimentell."
 *
 * (English: "The information on this web page was compiled to the best of
 *  our knowledge; however, neither completeness, correctness nor quality
 *  of the information provided is guaranteed. Note: the coloured syntax
 *  display and the measurement are still experimental.")
 */