/* Separating two types of XDP xmit: each flag is a distinct bit so both can
 * be OR-ed into one mask and tested independently.
 */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)
/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
DECLARE_EWMA(pkt_len, 0, 64)
/* Stats descriptor initializers: @name is the label and @m the member of the
 * send-queue / receive-queue stats structure whose offset is recorded. The
 * third initializer is -1 here, whereas the *_QSTAT variants supply an offset
 * into the netdev per-queue stats structure instead.
 */
#define VIRTNET_SQ_STAT(name, m) {name, offsetof(struct virtnet_sq_stats, m), -1}
#define VIRTNET_RQ_STAT(name, m) {name, offsetof(struct virtnet_rq_stats, m), -1}
/* Stats descriptor initializer for a send-queue counter: @name is the label
 * and @m names a member present in both struct virtnet_sq_stats and
 * struct netdev_queue_stats_tx; the offset of @m in each is recorded.
 */
#define VIRTNET_SQ_STAT_QSTAT(name, m)					\
	{ name, offsetof(struct virtnet_sq_stats, m),			\
	  offsetof(struct netdev_queue_stats_tx, m) }
/* Stats descriptor initializer for a receive-queue counter: @name is the
 * label and @m names a member present in both struct virtnet_rq_stats and
 * struct netdev_queue_stats_rx; the offset of @m in each is recorded.
 */
#define VIRTNET_RQ_STAT_QSTAT(name, m)					\
	{ name, offsetof(struct virtnet_rq_stats, m),			\
	  offsetof(struct netdev_queue_stats_rx, m) }
/* failover when STANDBY feature enabled */ struct failover *failover;
u64 device_stats_cap;
};
/* A virtio-net header followed by explicit padding; sizeof() of this struct
 * is what add_recvbuf_big-style code uses as the data offset within the page.
 */
struct padded_vnet_hdr {
	struct virtio_net_hdr_v1_hash hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[12];
};
/*
 * give_pages - return a chain of pages to the receive queue's free list.
 * @rq: receive queue whose rq->pages list receives the chain
 * @page: head of a chain of pages linked through page->private
 *
 * private is used to chain pages for big packets, put the whole
 * most recent used list in the beginning for reuse.
 *
 * A NULL @page (empty chain) is accepted and is a no-op, so callers that
 * may not have collected any page yet do not need their own guard.
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end = page;

	/* Nothing to give back; do not dereference an empty chain. */
	if (!page)
		return;

	/* Find end of list, sew whole thing into rq->pages. */
	while (end->private)
		end = (struct page *)end->private;

	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}
if (p) {
rq->pages = (struct page *)p->private; /* clear private here, it is used to chain pages */
p->private = 0;
} else
p = alloc_page(gfp_mask); return p;
}
/* Suppress further interrupts. */
virtqueue_disable_cb(vq);
if (napi->weight)
virtqueue_napi_schedule(napi, vq); else /* We were probably waiting for more output buffers. */
netif_wake_subqueue(vi->dev, vq2txq(vq));
}
/* copy small packet so we can reuse these pages for small data */
skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN); if (unlikely(!skb)) return NULL;
/* Copy all frame if it fits skb->head, otherwise * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
*/ if (len <= skb_tailroom(skb))
copy = len; else
copy = ETH_HLEN;
skb_put_data(skb, p, copy);
len -= copy;
offset += copy;
if (vi->mergeable_rx_bufs) { if (len)
skb_add_rx_frag(skb, 0, page, offset, len, truesize); else
page_to_free = page; goto ok;
}
/* new pages */ if (!alloc_frag->offset) { if (rq->last_dma) { /* Now, the new page is allocated, the last dma * will not be used. So the dma can be unmapped * if the ref is 0.
*/
virtnet_rq_unmap(rq, rq->last_dma, 0);
rq->last_dma = NULL;
}
/* Add a reference to dma to prevent the entire dma from * being released during error handling. This reference * will be freed after the pages are no longer used.
*/
get_page(alloc_frag->page);
dma->ref = 1;
alloc_frag->offset = sizeof(*dma);
/* Avoid overhead when no packets have been processed * happens when called speculatively from start_xmit.
*/ if (!stats.packets && !stats.napi_packets) return;
/* If running out of space, stop queue to avoid getting packets that we * are then unable to transmit. * An alternative would be to force queuing layer to requeue the skb by * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be * returned in a normal path of operation: it means that driver is not * maintaining the TX queue stop/start state properly, and causes * the stack to do a non-trivial amount of useless work. * Since most packets only take 1 or 2 ring slots, stopping the queue * early means 16 slots are typically wasted.
*/ if (sq->vq->num_free < MAX_SKB_FRAGS + 2) { struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
if (use_napi) { if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
virtqueue_napi_schedule(&sq->napi, sq->vq);
} elseif (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */
free_old_xmit(sq, txq, false); if (sq->vq->num_free >= MAX_SKB_FRAGS + 2) {
netif_start_subqueue(dev, qnum);
u64_stats_update_begin(&sq->stats.syncp);
u64_stats_inc(&sq->stats.wake);
u64_stats_update_end(&sq->stats.syncp);
virtqueue_disable_cb(sq->vq);
}
}
}
}
/* Note that @len is the length of received data without virtio header */ staticstruct xdp_buff *buf_to_xdp(struct virtnet_info *vi, struct receive_queue *rq, void *buf,
u32 len, bool first_buf)
{ struct xdp_buff *xdp;
u32 bufsize;
xdp = (struct xdp_buff *)buf;
/* In virtnet_add_recvbuf_xsk, we use part of XDP_PACKET_HEADROOM for * virtio header and ask the vhost to fill data from * hard_start + XDP_PACKET_HEADROOM - vi->hdr_len * The first buffer has virtio header so the remaining region for frame * data is * xsk_pool_get_rx_frame_size() * While other buffers than the first one do not have virtio header, so * the maximum frame data's length can be * xsk_pool_get_rx_frame_size() + vi->hdr_len
*/
bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool); if (!first_buf)
bufsize += vi->hdr_len;
ret = XDP_PASS;
rcu_read_lock();
prog = rcu_dereference(rq->xdp_prog); if (prog) { /* TODO: support multi buffer. */ if (num_buf == 1)
ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit,
stats); else
ret = XDP_ABORTED;
}
rcu_read_unlock();
switch (ret) { case XDP_PASS:
skb = xsk_construct_skb(rq, xdp); if (!skb) goto drop_bufs;
num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free); if (!num) return -ENOMEM;
len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
for (i = 0; i < num; ++i) { /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space. * We assume XDP_PACKET_HEADROOM is larger than hdr->len. * (see function virtnet_xsk_pool_enable)
*/
addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len;
nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); if (!nb_pkts) return 0;
for (i = 0; i < nb_pkts; i++) {
err = virtnet_xsk_xmit_one(sq, pool, &descs[i]); if (unlikely(err)) {
xsk_tx_completed(sq->xsk_pool, nb_pkts - i); break;
}
kick = true;
}
if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
(*kicks)++;
/* virtnet_xsk_completed - report finished xsk tx descriptors and wake tx.
 * @sq: send queue whose xsk_pool receives the completion
 * @num: number of completed descriptors passed to xsk_tx_completed()
 */
static void virtnet_xsk_completed(struct send_queue *sq, int num)
{
	xsk_tx_completed(sq->xsk_pool, num);

	/* If this is called by rx poll, start_xmit and xdp xmit we should
	 * wakeup the tx napi to consume the xsk tx queue, because the tx
	 * interrupt may not be triggered.
	 */
	xsk_wakeup(sq);
}
if (unlikely(xdpf->headroom < vi->hdr_len)) return -EOVERFLOW;
if (unlikely(xdp_frame_has_frags(xdpf))) {
shinfo = xdp_get_shared_info_from_frame(xdpf);
nr_frags = shinfo->nr_frags;
}
/* In wrapping function virtnet_xdp_xmit(), we need to free * up the pending old buffers, where we need to calculate the * position of skb_shared_info in xdp_get_frame_len() and * xdp_return_frame(), which will involve to xdpf->data and * xdpf->headroom. Therefore, we need to update the value of * headroom synchronously here.
*/
xdpf->headroom -= vi->hdr_len;
xdpf->data -= vi->hdr_len; /* Zero header and leave csum up to XDP layers */
hdr = xdpf->data;
memset(hdr, 0, vi->hdr_len);
xdpf->len += vi->hdr_len;
sg_init_table(sq->sg, nr_frags + 1);
sg_set_buf(sq->sg, xdpf->data, xdpf->len); for (i = 0; i < nr_frags; i++) {
skb_frag_t *frag = &shinfo->frags[i];
/* When vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on
 * the current cpu, so it does not need to be locked.
 *
 * Here we use a macro instead of inline functions because we have to deal with
 * three issues at the same time: 1. the choice of sq. 2. judge and execute the
 * lock/unlock of txq 3. make sparse happy. It is difficult for two inline
 * functions to perfectly solve these three problems at the same time.
 *
 * The macro evaluates to a pointer to the selected send queue (v->sq + qp);
 * @vi is evaluated exactly once.
 */
#define virtnet_xdp_get_sq(vi) ({					\
	int cpu = smp_processor_id();					\
	struct netdev_queue *txq;					\
	typeof(vi) v = (vi);						\
	unsigned int qp;						\
									\
	if (v->curr_queue_pairs > nr_cpu_ids) {				\
		qp = v->curr_queue_pairs - v->xdp_queue_pairs;		\
		qp += cpu;						\
		txq = netdev_get_tx_queue(v->dev, qp);			\
		__netif_tx_acquire(txq);				\
	} else {							\
		qp = cpu % v->curr_queue_pairs;				\
		txq = netdev_get_tx_queue(v->dev, qp);			\
		__netif_tx_lock(txq, cpu);				\
	}								\
	v->sq + qp;							\
})
staticint virtnet_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags)
{ struct virtnet_info *vi = netdev_priv(dev); struct virtnet_sq_free_stats stats = {0}; struct receive_queue *rq = vi->rq; struct bpf_prog *xdp_prog; struct send_queue *sq; int nxmit = 0; int kicks = 0; int ret; int i;
/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 * indicates XDP resources have been successfully allocated.
 */
xdp_prog = rcu_access_pointer(rq->xdp_prog); if (!xdp_prog) return -ENXIO;
sq = virtnet_xdp_get_sq(vi);
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
ret = -EINVAL; goto out;
}
/* Free up any pending old buffers before queueing new ones. */
virtnet_free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), false, &stats);
for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i];
if (__virtnet_xdp_xmit_one(vi, sq, xdpf)) break;
nxmit++;
}
ret = nxmit;
if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
check_sq_full_and_disable(vi, dev, sq);
if (flags & XDP_XMIT_FLUSH) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
kicks = 1;
}
out:
u64_stats_update_begin(&sq->stats.syncp);
u64_stats_add(&sq->stats.bytes, stats.bytes);
u64_stats_add(&sq->stats.packets, stats.packets);
u64_stats_add(&sq->stats.xdp_tx, n);
u64_stats_add(&sq->stats.xdp_tx_drops, n - nxmit);
u64_stats_add(&sq->stats.kicks, kicks);
u64_stats_update_end(&sq->stats.syncp);
case XDP_TX:
u64_stats_inc(&stats->xdp_tx);
xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) {
netdev_dbg(dev, "convert buff to frame failed for xdp\n"); return XDP_DROP;
}
/* We copy the packet for XDP in the following cases: * * 1) Packet is scattered across multiple rx buffers. * 2) Headroom space is insufficient. * * This is inefficient but it's a temporary condition that * we hit right after XDP is enabled and until queue is refilled * with large buffers with sufficient headroom - so it should affect * at most queue size packets. * Afterwards, the conditions to enable * XDP should preclude the underlying device from sending packets * across multiple buffers (num_buf > 1), and we make sure buffers * have enough headroom.
*/ staticstruct page *xdp_linearize_page(struct net_device *dev, struct receive_queue *rq, int *num_buf, struct page *p, int offset, int page_off, unsignedint *len)
{ int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct page *page;
if (page_off + *len + tailroom > PAGE_SIZE) return NULL;
page = alloc_page(GFP_ATOMIC); if (!page) return NULL;
/* Only mergeable mode can go inside this while loop. In small mode, * *num_buf == 1, so it cannot go inside.
*/ while (--*num_buf) { unsignedint buflen; void *buf; void *ctx; int off;
buf = virtnet_rq_get_buf(rq, &buflen, &ctx); if (unlikely(!buf)) goto err_buf;
p = virt_to_head_page(buf);
off = buf - page_address(p);
if (check_mergeable_len(dev, ctx, buflen)) {
put_page(p); goto err_buf;
}
/* guard against a misconfigured or uncooperative backend that * is sending packet larger than the MTU.
*/ if ((page_off + buflen + tailroom) > PAGE_SIZE) {
put_page(p); goto err_buf;
}
switch (act) { case XDP_PASS: /* Recalculate length in case bpf program changed it */
len = xdp.data_end - xdp.data;
metasize = xdp.data - xdp.data_meta; break;
/* Make sure that len does not exceed the size allocated in * add_recvbuf_big.
*/ if (unlikely(len > (vi->big_packets_num_skbfrags + 1) * PAGE_SIZE)) {
pr_debug("%s: rx error: len %u exceeds allocated size %lu\n",
dev->name, len,
(vi->big_packets_num_skbfrags + 1) * PAGE_SIZE); goto err;
}
skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0);
u64_stats_add(&stats->bytes, len - vi->hdr_len); if (unlikely(!skb)) goto err;
/* Why not use xdp_build_skb_from_frame() ? * XDP core assumes that xdp frags are PAGE_SIZE in length, while in * virtio-net there are 2 points that do not match its requirements: * 1. The size of the prefilled buffer is not fixed before xdp is set. * 2. xdp_build_skb_from_frame() does more checks that we don't need, * like eth_type_trans() (which virtio-net does in receive_buf()).
*/ staticstruct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, struct virtnet_info *vi, struct xdp_buff *xdp, unsignedint xdp_frags_truesz)
{ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); unsignedint headroom, data_len; struct sk_buff *skb; int metasize;
u8 nr_frags;
if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
pr_debug("Error building skb as missing reserved tailroom for xdp"); return NULL;
}
if (unlikely(xdp_buff_has_frags(xdp)))
nr_frags = sinfo->nr_frags;
skb = build_skb(xdp->data_hard_start, xdp->frame_sz); if (unlikely(!skb)) return NULL;
if (*num_buf > 1) { /* If we want to build multi-buffer xdp, we need * to specify that the flags of xdp_buff have the * XDP_FLAGS_HAS_FRAG bit.
*/ if (!xdp_buff_has_frags(xdp))
xdp_buff_set_frags_flag(xdp);
/* Transient failure which in theory could occur if * in-flight packets from before XDP was enabled reach * the receive path after XDP is loaded.
*/ if (unlikely(hdr->hdr.gso_type)) return NULL;
/* Partially checksummed packets must be dropped. */ if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return NULL;
/* Now XDP core assumes frag size is PAGE_SIZE, but buffers * with headroom may add hole in truesize, which * make their length exceed PAGE_SIZE. So we disabled the * hole mechanism for xdp. See add_recvbuf_mergeable().
*/
*frame_sz = truesize;
/* This happens when headroom is not enough because * of the buffer was prefilled before XDP is set. * This should only happen for the first several packets. * In fact, vq reset can be used here to help us clean up * the prefilled buffers, but many existing devices do not * support it, and we don't want to bother users who are * using xdp normally.
*/ if (!xdp_prog->aux->xdp_has_frags) { /* linearize data for XDP */
xdp_page = xdp_linearize_page(vi->dev, rq, num_buf,
*page, offset,
XDP_PACKET_HEADROOM,
len); if (!xdp_page) return NULL;
} else {
xdp_room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + sizeof(struct skb_shared_info)); if (*len + xdp_room > PAGE_SIZE) return NULL;
xdp_page = alloc_page(GFP_ATOMIC); if (!xdp_page) return NULL;
switch (__le16_to_cpu(hdr_hash->hash_report)) { case VIRTIO_NET_HASH_REPORT_TCPv4: case VIRTIO_NET_HASH_REPORT_UDPv4: case VIRTIO_NET_HASH_REPORT_TCPv6: case VIRTIO_NET_HASH_REPORT_UDPv6: case VIRTIO_NET_HASH_REPORT_TCPv6_EX: case VIRTIO_NET_HASH_REPORT_UDPv6_EX:
rss_hash_type = PKT_HASH_TYPE_L4; break; case VIRTIO_NET_HASH_REPORT_IPv4: case VIRTIO_NET_HASH_REPORT_IPv6: case VIRTIO_NET_HASH_REPORT_IPv6_EX:
rss_hash_type = PKT_HASH_TYPE_L3; break; case VIRTIO_NET_HASH_REPORT_NONE: default:
rss_hash_type = PKT_HASH_TYPE_NONE;
}
skb_set_hash(skb, virtio_net_hash_value(hdr_hash), rss_hash_type);
}
if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
pr_debug("%s: short packet %i\n", dev->name, len);
DEV_STATS_INC(dev, rx_length_errors);
virtnet_rq_free_buf(vi, rq, buf); return;
}
/* About the flags below: * 1. Save the flags early, as the XDP program might overwrite them. * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID * stay valid after XDP processing. * 2. XDP doesn't work with partially checksummed packets (refer to * virtnet_xdp_set()), so packets marked as * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing.
*/
/* Unlike mergeable buffers, all buffers are allocated to the * same size, except for the headroom. For this reason we do * not need to use mergeable_len_to_ctx here - it is enough * to store the headroom as the context ignoring the truesize.
*/ staticint add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
gfp_t gfp)
{ char *buf; unsignedint xdp_headroom = virtnet_get_headroom(vi); void *ctx = (void *)(unsignedlong)xdp_headroom; int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom; int err;
len = SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
if (unlikely(!skb_page_frag_refill(len, &rq->alloc_frag, gfp))) return -ENOMEM;
buf = virtnet_rq_alloc(rq, len, gfp); if (unlikely(!buf)) return -ENOMEM;
/* page in rq->sg[vi->big_packets_num_skbfrags + 1] is list tail */ for (i = vi->big_packets_num_skbfrags + 1; i > 1; --i) {
first = get_a_page(rq, gfp); if (!first) { if (list)
give_pages(rq, list); return -ENOMEM;
}
sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
/* chain new page in list head to match sg */
first->private = (unsignedlong)list;
list = first;
}
first = get_a_page(rq, gfp); if (!first) {
give_pages(rq, list); return -ENOMEM;
}
p = page_address(first);
/* rq->sg[0], rq->sg[1] share the same page */ /* a separated rq->sg[0] for header - required in case !any_header_sg */
sg_set_buf(&rq->sg[0], p, vi->hdr_len);
/* rq->sg[1] for data packet, from offset */
offset = sizeof(struct padded_vnet_hdr);
sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
/* chain first in list head */
first->private = (unsignedlong)list;
err = virtqueue_add_inbuf(rq->vq, rq->sg, vi->big_packets_num_skbfrags + 2,
first, gfp); if (err < 0)
give_pages(rq, first);
/* Extra tailroom is needed to satisfy XDP's assumption. This * means rx frags coalescing won't work, but consider we've * disabled GSO for XDP, it won't be a big issue.
*/
len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) return -ENOMEM;
if (!alloc_frag->offset && len + room + sizeof(struct virtnet_rq_dma) > alloc_frag->size)
len -= sizeof(struct virtnet_rq_dma);
buf = virtnet_rq_alloc(rq, len + room, gfp); if (unlikely(!buf)) return -ENOMEM;
buf += headroom; /* advance address leaving hole at front of pkt */
hole = alloc_frag->size - alloc_frag->offset; if (hole < len + room) { /* To avoid internal fragmentation, if there is very likely not * enough space for another buffer, add the remaining space to * the current buffer. * XDP core assumes that frame_size of xdp_buff and the length * of the frag are PAGE_SIZE, so we disable the hole mechanism.
*/ if (!headroom)
len += hole;
alloc_frag->offset += hole;
}
/* * Returns false if we couldn't fill entirely (OOM). * * Normally run in the receive path, but can also be run from ndo_open * before we're receiving packets, or from refill_work which is * careful to disable receiving (using napi_disable).
*/ staticbool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
gfp_t gfp)
{ int err;
/* If all buffers were filled by other side before we napi_enabled, we * won't get another interrupt, so process any outstanding packets now. * Call local_bh_enable after to trigger softIRQ processing.
*/
local_bh_disable();
virtqueue_napi_schedule(napi, vq);
local_bh_enable();
}
/* Tx napi touches cachelines on the cpu handling tx interrupts. Only * enable the feature if this is likely affine with the transmit path.
*/ if (!vi->affinity_hint_set) {
napi->weight = 0; return;
}
for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i];
/* * When queue API support is added in the future and the call * below becomes napi_disable_locked, this driver will need to * be refactored. * * One possible solution would be to: * - cancel refill_work with cancel_delayed_work (note: * non-sync) * - cancel refill_work with cancel_delayed_work_sync in * virtnet_remove after the netdev is unregistered * - wrap all of the work in a lock (perhaps the netdev * instance lock) * - check netif_running() and return early to avoid a race
*/
napi_disable(&rq->napi);
still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
virtnet_napi_do_enable(rq->vq, &rq->napi);
/* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again.
*/ if (still_empty)
schedule_delayed_work(&vi->refill, HZ/2);
}
}
/* Don't need protection when fetching stats, since fetcher and * updater of the stats are in same context
*/
dim_update_sample(rq->calls,
u64_stats_read(&rq->stats.packets),
u64_stats_read(&rq->stats.bytes),
&cur_sample);
received = virtnet_receive(rq, budget, &xdp_xmit);
rq->packets_in_napi += received;
if (xdp_xmit & VIRTIO_XDP_REDIR)
xdp_do_flush();
/* Out of packets? */ if (received < budget) {
napi_complete = virtqueue_napi_complete(napi, rq->vq, received); /* Intentionally not taking dim_lock here. This may result in a * spurious net_dim call. But if that happens virtnet_rx_dim_work * will not act on the scheduled work.
*/ if (napi_complete && rq->dim_enabled)
virtnet_rx_dim_update(vi, rq);
}
if (xdp_xmit & VIRTIO_XDP_TX) {
sq = virtnet_xdp_get_sq(vi); if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
u64_stats_update_begin(&sq->stats.syncp);
u64_stats_inc(&sq->stats.kicks);
u64_stats_update_end(&sq->stats.syncp);
}
virtnet_xdp_put_sq(vi, sq);
}
if (ethtool_validate_duplex(duplex))
vi->duplex = duplex;
}
staticint virtnet_open(struct net_device *dev)
{ struct virtnet_info *vi = netdev_priv(dev); int i, err;
enable_delayed_refill(vi);
for (i = 0; i < vi->max_queue_pairs; i++) { if (i < vi->curr_queue_pairs) /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
if (running) {
virtnet_napi_disable(rq);
virtnet_cancel_dim(vi, &rq->dim);
}
}
/* virtnet_rx_pause_all - pause rx on every queue pair of the device.
 * @vi: virtio net info
 *
 * Calls __virtnet_rx_pause() for all vi->max_queue_pairs receive queues
 * after the refill work has been disabled and cancelled.
 */
static void virtnet_rx_pause_all(struct virtnet_info *vi)
{
	int i;

	/*
	 * Make sure refill_work does not run concurrently to
	 * avoid napi_disable race which leads to deadlock.
	 */
	disable_delayed_refill(vi);
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++)
		__virtnet_rx_pause(vi, &vi->rq[i]);
}
/* virtnet_rx_pause - pause rx on a single receive queue.
 * @vi: virtio net info
 * @rq: the receive queue handed to __virtnet_rx_pause()
 */
static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
{
	/*
	 * Make sure refill_work does not run concurrently to
	 * avoid napi_disable race which leads to deadlock.
	 */
	disable_delayed_refill(vi);
	cancel_delayed_work_sync(&vi->refill);

	__virtnet_rx_pause(vi, rq);
}
if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
schedule_refill = true; if (running)
virtnet_napi_enable(rq);
if (schedule_refill)
schedule_delayed_work(&vi->refill, 0);
}
/* virtnet_rx_resume_all - resume rx on every queue pair of the device.
 * @vi: virtio net info
 *
 * Re-enables the delayed refill, then calls __virtnet_rx_resume() for all
 * vi->max_queue_pairs receive queues. The last argument is true only for
 * queues below vi->curr_queue_pairs and false for the remaining ones.
 */
static void virtnet_rx_resume_all(struct virtnet_info *vi)
{
	int i;

	enable_delayed_refill(vi);

	for (i = 0; i < vi->max_queue_pairs; i++)
		__virtnet_rx_resume(vi, &vi->rq[i], i < vi->curr_queue_pairs);
}
BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC); if (ret < 0) {
dev_warn(&vi->vdev->dev, "Failed to add sgs for command vq: %d\n.", ret);
mutex_unlock(&vi->cvq_lock); returnfalse;
}
if (unlikely(!virtqueue_kick(vi->cvq))) goto unlock;
/* Spin for a response, the kick causes an ioport write, trapping * into the hypervisor, so the request should be handled immediately.
*/ while (!virtqueue_get_buf(vi->cvq, &tmp) &&
!virtqueue_is_broken(vi->cvq)) {
cond_resched();
cpu_relax();
}
unlock:
ok = vi->ctrl->status == VIRTIO_NET_OK;
mutex_unlock(&vi->cvq_lock); return ok;
}
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY)) return -EOPNOTSUPP;
addr = kmemdup(p, sizeof(*addr), GFP_KERNEL); if (!addr) return -ENOMEM;
ret = eth_prepare_mac_addr_change(dev, addr); if (ret) goto out;
if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
sg_init_one(&sg, addr->sa_data, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
dev_warn(&vdev->dev, "Failed to set mac address by vq command.\n");
ret = -EINVAL; goto out;
}
} elseif (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { unsignedint i;
/* Naturally, this has an atomicity problem. */ for (i = 0; i < dev->addr_len; i++)
virtio_cwrite8(vdev,
offsetof(struct virtio_net_config, mac) +
i, addr->sa_data[i]);
}
if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0;
/* Firstly check if we need to update rss. Do updating if both (1) rss enabled and
 * (2) no user configuration.
 *
 * During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is,
 * the device updates queue_pairs together with rss, so we can skip the separate queue_pairs
 * update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly.
*/ if (vi->has_rss && !netif_is_rxfh_configured(dev)) {
old_rss_hdr = vi->rss_hdr;
old_rss_trailer = vi->rss_trailer;
vi->rss_hdr = devm_kzalloc(&dev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); if (!vi->rss_hdr) {
vi->rss_hdr = old_rss_hdr; return -ENOMEM;
}
if (!virtnet_commit_rss_command(vi)) { /* restore ctrl_rss if commit_rss_command failed */
devm_kfree(&dev->dev, vi->rss_hdr);
vi->rss_hdr = old_rss_hdr;
vi->rss_trailer = old_rss_trailer;
dev_warn(&dev->dev, "Fail to set num of queue pairs to %d, because committing RSS failed\n",
queue_pairs); return -EINVAL;
}
devm_kfree(&dev->dev, old_rss_hdr); goto succ;
}
mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) return -ENOMEM;
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
queue_pairs); return -EINVAL;
}
succ:
vi->curr_queue_pairs = queue_pairs; /* virtnet_open() will refill when device is going to up. */
spin_lock_bh(&vi->refill_lock); if (dev->flags & IFF_UP && vi->refill_enabled)
schedule_delayed_work(&vi->refill, 0);
spin_unlock_bh(&vi->refill_lock);
/* Make sure NAPI doesn't schedule refill work */
disable_delayed_refill(vi); /* Make sure refill_work doesn't re-enable napi! */
cancel_delayed_work_sync(&vi->refill); /* Prevent the config change callback from changing carrier * after close
*/
virtio_config_driver_disable(vi->vdev); /* Stop getting status/speed updates: we don't care until next * open
*/
cancel_work_sync(&vi->config_work);
for (i = 0; i < vi->max_queue_pairs; i++) {
virtnet_disable_queue_pair(vi, i);
virtnet_cancel_dim(vi, &vi->rq[i].dim);
}
/* We can't dynamically set ndo_set_rx_mode, so return gracefully */ if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return;
promisc_allmulti = kzalloc(sizeof(*promisc_allmulti), GFP_KERNEL); if (!promisc_allmulti) {
dev_warn(&dev->dev, "Failed to set RX mode, no memory.\n"); return;
}
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
*promisc_allmulti ? "en" : "dis");
netif_addr_lock_bh(dev);
uc_count = netdev_uc_count(dev);
mc_count = netdev_mc_count(dev); /* MAC filter - use one buffer for both lists */
buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
(2 * sizeof(mac_data->entries)), GFP_ATOMIC);
mac_data = buf; if (!buf) {
netif_addr_unlock_bh(dev);
rtnl_unlock(); return;
}
sg_init_table(sg, 2);
/* Store the unicast list and count in the front of the buffer */
mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
i = 0;
netdev_for_each_uc_addr(ha, dev)
memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
VIRTIO_NET_CTRL_VLAN_DEL, &sg))
dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); return 0;
}
/* virtnet_clean_affinity - drop the affinity setting of every virtqueue.
 * @vi: virtio net info
 *
 * Does nothing unless vi->affinity_hint_set is set. Otherwise passes a NULL
 * mask to virtqueue_set_affinity() for the rx and tx virtqueue of each of
 * the vi->max_queue_pairs pairs and clears vi->affinity_hint_set.
 */
static void virtnet_clean_affinity(struct virtnet_info *vi)
{
	int i;

	if (!vi->affinity_hint_set)
		return;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		virtqueue_set_affinity(vi->rq[i].vq, NULL);
		virtqueue_set_affinity(vi->sq[i].vq, NULL);
	}

	vi->affinity_hint_set = false;
}
staticvoid virtnet_set_affinity(struct virtnet_info *vi)
{
cpumask_var_t mask; int stragglers; int group_size; int i, start = 0, cpu; int num_cpu; int stride;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
virtnet_clean_affinity(vi); return;
}
if (ring->rx_pending == rx_pending &&
ring->tx_pending == tx_pending) return 0;
if (ring->rx_pending > vi->rq[0].vq->num_max) return -EINVAL;
if (ring->tx_pending > vi->sq[0].vq->num_max) return -EINVAL;
for (i = 0; i < vi->max_queue_pairs; i++) {
rq = vi->rq + i;
sq = vi->sq + i;
if (ring->tx_pending != tx_pending) {
err = virtnet_tx_resize(vi, sq, ring->tx_pending); if (err) return err;
/* Upon disabling and re-enabling a transmit virtqueue, the device must * set the coalescing parameters of the virtqueue to those configured * through the VIRTIO_NET_CTRL_NOTF_COAL_TX_SET command, or, if the driver * did not set any TX coalescing parameters, to 0.
*/
err = virtnet_send_tx_ctrl_coal_vq_cmd(vi, i,
vi->intr_coal_tx.max_usecs,
vi->intr_coal_tx.max_packets);
/* Don't break the tx resize action if the vq coalescing is not * supported. The same is true for rx resize below.
*/ if (err && err != -EOPNOTSUPP) return err;
}
if (ring->rx_pending != rx_pending) {
err = virtnet_rx_resize(vi, rq, ring->rx_pending); if (err) return err;
/* The reason is same as the transmit virtqueue reset */
mutex_lock(&vi->rq[i].dim_lock);
err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, i,
vi->intr_coal_rx.max_usecs,
vi->intr_coal_rx.max_packets);
mutex_unlock(&vi->rq[i].dim_lock); if (err && err != -EOPNOTSUPP) return err;
}
}
/* We don't support separate rx/tx channels. * We don't allow setting 'other' channels.
*/ if (channels->rx_count || channels->tx_count || channels->other_count) return -EINVAL;
if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0) return -EINVAL;
/* For now we don't support modifying channels while XDP is loaded * also when XDP is loaded all RX queues have XDP programs so we only * need to check a single RX queue.
*/ if (vi->rq[0].xdp_prog) return -EINVAL;
/* virtnet_stats_sprintf - emit one stat name per descriptor.
 * @p: output cursor handed to ethtool_sprintf() (presumably advanced by it;
 *     confirm against the ethtool helper)
 * @fmt: format used when @qid >= 0; receives @qid and the descriptor name
 * @noq_fmt: format used when @qid < 0; receives only the descriptor name
 * @num: number of entries in @desc
 * @qid: queue id, or a negative value for the queue-less form
 * @desc: descriptor array whose ->desc strings are printed
 */
static void virtnet_stats_sprintf(u8 **p, const char *fmt, const char *noq_fmt,
				  int num, int qid,
				  const struct virtnet_stat_desc *desc)
{
	int i;

	for (i = 0; i < num; ++i) {
		if (qid < 0)
			ethtool_sprintf(p, noq_fmt, desc[i].desc);
		else
			ethtool_sprintf(p, fmt, qid, desc[i].desc);
	}
}
/* qid == -1: for rx/tx queue total field */ staticvoid virtnet_get_stats_string(struct virtnet_info *vi, int type, int qid, u8 **data)
{ conststruct virtnet_stat_desc *desc; constchar *fmt, *noq_fmt;
u8 *p = *data;
u32 num;
/* stats_sum_queue - Calculate the sum of the same fields in sq or rq.
 * @sum: the position to store the sum values (@num entries are written)
 * @num: field num, i.e. how many u64 fields each queue contributes
 * @q_value: the first queue fields; the fields of queue j start at
 *           q_value + j * @num
 * @q_num: number of the queues
 *
 * With @q_num == 0 every one of the @num sum entries is set to zero.
 */
static void stats_sum_queue(u64 *sum, u32 num, u64 *q_value, u32 q_num)
{
	/* Unsigned indices: both loop bounds are u32. */
	u32 field, queue;

	for (field = 0; field < num; ++field) {
		sum[field] = 0;

		for (queue = 0; queue < q_num; ++queue)
			sum[field] += q_value[field + queue * num];
	}
}
if (drv_stats) { if (queue_type == VIRTNET_Q_TYPE_RX) {
desc = &virtnet_rq_stats_desc_qstat[0];
num = ARRAY_SIZE(virtnet_rq_stats_desc_qstat);
} else {
desc = &virtnet_sq_stats_desc_qstat[0];
num = ARRAY_SIZE(virtnet_sq_stats_desc_qstat);
}
for (i = 0; i < num; ++i) {
offset = desc[i].qstat_offset / sizeof(*ctx->data);
v_stat = (const u64_stats_t *)(base + desc[i].offset);
ctx->data[offset] = u64_stats_read(v_stat);
} return;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_RX_BASIC) {
desc = &virtnet_stats_rx_basic_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_rx_basic_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC) goto found;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_RX_CSUM) {
desc = &virtnet_stats_rx_csum_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_rx_csum_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM) goto found;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_RX_GSO) {
desc = &virtnet_stats_rx_gso_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_rx_gso_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_GSO) goto found;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_RX_SPEED) {
desc = &virtnet_stats_rx_speed_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_rx_speed_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED) goto found;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_TX_BASIC) {
desc = &virtnet_stats_tx_basic_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_tx_basic_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC) goto found;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_TX_CSUM) {
desc = &virtnet_stats_tx_csum_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_tx_csum_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_CSUM) goto found;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_TX_GSO) {
desc = &virtnet_stats_tx_gso_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_tx_gso_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO) goto found;
}
if (bitmap & VIRTIO_NET_STATS_TYPE_TX_SPEED) {
desc = &virtnet_stats_tx_speed_desc_qstat[0];
num = ARRAY_SIZE(virtnet_stats_tx_speed_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED) goto found;
}
return;
found: for (i = 0; i < num; ++i) {
offset = desc[i].qstat_offset / sizeof(*ctx->data);
v = (const __le64 *)(base + desc[i].offset);
ctx->data[offset] = le64_to_cpu(*v);
}
}
/* virtnet_fill_stats - copy the stats to qstats or ethtool -S
 * The stats source is the device or the driver.
 *
 * @vi: virtio net info
 * @qid: the vq id
 * @ctx: stats ctx (initialized by virtnet_stats_ctx_init())
 * @base: pointer to the device reply or the driver stats structure.
 * @drv_stats: designate the base type (device reply, driver stats)
 * @reply_type: the type of the device reply (if drv_stats is true, this must be zero)
*/ staticvoid virtnet_fill_stats(struct virtnet_info *vi, u32 qid, struct virtnet_stats_ctx *ctx, const u8 *base, bool drv_stats, u8 reply_type)
{
u32 queue_type, num_rx, num_tx, num_cq; conststruct virtnet_stat_desc *desc; const u64_stats_t *v_stat;
u64 offset, bitmap; const __le64 *v; int i, num;
if (ctx->to_qstat) return virtnet_fill_stats_qstat(vi, qid, ctx, base, drv_stats, reply_type);
/* qid: -1: get stats of all vqs.
 *      > 0: get the stats for the specific vq. This must not be cvq.
*/ staticint virtnet_get_hw_stats(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, int qid)
{ int qnum, i, j, res_size, qtype, last_vq, first_vq; struct virtio_net_ctrl_queue_stats *req; bool enable_cvq; void *reply; int ok;
if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_DEVICE_STATS)) return 0;
switch (stringset) { case ETH_SS_STATS: /* Generate the total field names. */
virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_RX, -1, &p);
virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_TX, -1, &p);
if (rx_ctrl_dim_on && !vi->rx_dim_enabled) {
vi->rx_dim_enabled = true; for (i = 0; i < vi->max_queue_pairs; i++) {
mutex_lock(&vi->rq[i].dim_lock);
vi->rq[i].dim_enabled = true;
mutex_unlock(&vi->rq[i].dim_lock);
} return 0;
}
coal_rx = kzalloc(sizeof(*coal_rx), GFP_KERNEL); if (!coal_rx) return -ENOMEM;
if (!rx_ctrl_dim_on && vi->rx_dim_enabled) {
vi->rx_dim_enabled = false; for (i = 0; i < vi->max_queue_pairs; i++) {
mutex_lock(&vi->rq[i].dim_lock);
vi->rq[i].dim_enabled = false;
mutex_unlock(&vi->rq[i].dim_lock);
}
}
/* Since the per-queue coalescing params can be set, * we need apply the global new params even if they * are not updated.
*/
coal_rx->rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs);
coal_rx->rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames);
sg_init_one(&sgs_rx, coal_rx, sizeof(*coal_rx));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
VIRTIO_NET_CTRL_NOTF_COAL_RX_SET,
&sgs_rx)) return -EINVAL;
vi->intr_coal_rx.max_usecs = ec->rx_coalesce_usecs;
vi->intr_coal_rx.max_packets = ec->rx_max_coalesced_frames; for (i = 0; i < vi->max_queue_pairs; i++) {
mutex_lock(&vi->rq[i].dim_lock);
vi->rq[i].intr_coal.max_usecs = ec->rx_coalesce_usecs;
vi->rq[i].intr_coal.max_packets = ec->rx_max_coalesced_frames;
mutex_unlock(&vi->rq[i].dim_lock);
}
return 0;
}
staticint virtnet_send_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec)
{ int err;
err = virtnet_send_tx_notf_coal_cmds(vi, ec); if (err) return err;
err = virtnet_send_rx_notf_coal_cmds(vi, ec); if (err) return err;
if (!rx_ctrl_dim_on && cur_rx_dim)
vi->rq[queue].dim_enabled = false;
/* If no params are updated, userspace ethtool will * reject the modification.
*/
err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, queue,
ec->rx_coalesce_usecs,
ec->rx_max_coalesced_frames);
mutex_unlock(&vi->rq[queue].dim_lock); return err;
}
mutex_lock(&rq->dim_lock); if (!rq->dim_enabled) goto out;
update_moder = net_dim_get_rx_irq_moder(dev, dim); if (update_moder.usec != rq->intr_coal.max_usecs ||
update_moder.pkts != rq->intr_coal.max_packets) {
err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum,
update_moder.usec,
update_moder.pkts); if (err)
pr_debug("%s: Failed to send dim parameters on rxq%d\n",
dev->name, qnum);
}
out:
dim->state = DIM_START_MEASURE;
mutex_unlock(&rq->dim_lock);
}
/* virtnet_coal_params_supported - check coalescing params without sending
 * anything to the device.
 * @ec: the requested ethtool coalescing parameters
 *
 * Return: -EOPNOTSUPP if either usecs value is non-zero, -EINVAL if
 * tx_max_coalesced_frames is above 1 or rx_max_coalesced_frames is not
 * exactly 1, and 0 otherwise.
 */
static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
{
	/* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL
	 * or VIRTIO_NET_F_VQ_NOTF_COAL feature is negotiated.
	 */
	if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs)
		return -EOPNOTSUPP;

	/* Only tx frames of 0 or 1 and rx frames of exactly 1 are accepted. */
	if (ec->tx_max_coalesced_frames > 1 ||
	    ec->rx_max_coalesced_frames != 1)
		return -EINVAL;

	return 0;
}
staticint virtnet_should_update_vq_weight(int dev_flags, int weight, int vq_weight, bool *should_update)
{ if (weight ^ vq_weight) { if (dev_flags & IFF_UP) return -EBUSY;
*should_update = true;
}
/* Can't change NAPI weight if the link is up */
napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) {
ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
vi->sq[queue_number].napi.weight,
&update_napi); if (ret) return ret;
if (update_napi) { /* All queues that belong to [queue_number, vi->max_queue_pairs] will be * updated for the sake of simplicity, which might not be necessary
*/ break;
}
}
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
ret = virtnet_send_notf_coal_cmds(vi, ec); else
ret = virtnet_coal_params_supported(ec);
if (ret) return ret;
if (update_napi) { /* xsk xmit depends on the tx napi. So if xsk is active, * prevent modifications to tx napi.
*/ for (i = queue_number; i < vi->max_queue_pairs; i++) { if (vi->sq[i].xsk_pool) return -EBUSY;
}
for (; queue_number < vi->max_queue_pairs; queue_number++)
vi->sq[queue_number].napi.weight = napi_weight;
}
/* Can't change NAPI weight if the link is up */
napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
ret = virtnet_should_update_vq_weight(dev->flags, napi_weight,
vi->sq[queue].napi.weight,
&update_napi); if (ret) return ret;
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); else
ret = virtnet_coal_params_supported(ec);
if (ret) return ret;
if (update_napi)
vi->sq[queue].napi.weight = napi_weight;
if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
rxfh->hfunc != ETH_RSS_HASH_TOP) return -EOPNOTSUPP;
if (rxfh->indir) { if (!vi->has_rss) return -EOPNOTSUPP;
for (i = 0; i < vi->rss_indir_table_size; ++i)
vi->rss_hdr->indirection_table[i] = cpu_to_le16(rxfh->indir[i]);
update = true;
}
if (rxfh->key) { /* If either _F_HASH_REPORT or _F_RSS are negotiated, the * device provides hash calculation capabilities, that is, * hash_key is configured.
*/ if (!vi->has_rss && !vi->has_rss_hash_report) return -EOPNOTSUPP;
_offloads = kzalloc(sizeof(*_offloads), GFP_KERNEL); if (!_offloads) return -ENOMEM;
*_offloads = cpu_to_virtio64(vi->vdev, offloads);
sg_init_one(&sg, _offloads, sizeof(*_offloads));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
dev_warn(&vi->dev->dev, "Fail to set guest offload.\n"); return -EINVAL;
}
if (vi->hdr_len > xsk_pool_get_headroom(pool)) return -EINVAL;
/* In big_packets mode, xdp cannot work, so there is no need to * initialize xsk of rq.
*/ if (vi->big_packets && !vi->mergeable_rx_bufs) return -ENOENT;
if (qid >= vi->curr_queue_pairs) return -EINVAL;
sq = &vi->sq[qid];
rq = &vi->rq[qid];
/* xsk assumes that tx and rx must have the same dma device. The af-xdp * may use one buffer to receive from the rx and reuse this buffer to * send by the tx. So the dma dev of sq and rq must be the same one. * * But vq->dma_dev allows every vq has the respective dma dev. So I * check the dma dev of vq and sq is the same dev.
*/ if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) return -EINVAL;
dma_dev = virtqueue_dma_dev(rq->vq); if (!dma_dev) return -EINVAL;
size = virtqueue_get_vring_size(rq->vq);
rq->xsk_buffs = kvcalloc(size, sizeof(*rq->xsk_buffs), GFP_KERNEL); if (!rq->xsk_buffs) return -ENOMEM;
err = xsk_pool_dma_map(pool, dma_dev, 0); if (err) goto err_xsk_map;
err = virtnet_rq_bind_xsk_pool(vi, rq, pool); if (err) goto err_rq;
err = virtnet_sq_bind_xsk_pool(vi, sq, pool); if (err) goto err_sq;
/* Now, we do not support tx offload(such as tx csum), so all the tx * virtnet hdr is zero. So all the tx packets can share a single hdr.
*/
sq->xsk_hdr_dma_addr = hdr_dma;
if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
&& (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6))) {
NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first"); return -EOPNOTSUPP;
}
if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required"); return -EINVAL;
}
if (prog && !prog->aux->xdp_has_frags && dev->mtu > max_sz) {
NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP without frags");
netdev_warn(dev, "single-buffer XDP requires MTU less than %u\n", max_sz); return -EINVAL;
}
curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs; if (prog)
xdp_qp = nr_cpu_ids;
/* XDP requires extra queues for XDP_TX */ if (curr_qp + xdp_qp > vi->max_queue_pairs) {
netdev_warn_once(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
curr_qp + xdp_qp, vi->max_queue_pairs);
xdp_qp = 0;
}
old_prog = rtnl_dereference(vi->rq[0].xdp_prog); if (!prog && !old_prog) return 0;
if (prog)
bpf_prog_add(prog, vi->max_queue_pairs - 1);
virtnet_rx_pause_all(vi);
/* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { for (i = 0; i < vi->max_queue_pairs; i++)
virtnet_napi_tx_disable(&vi->sq[i]);
}
if (!prog) { for (i = 0; i < vi->max_queue_pairs; i++) {
rcu_assign_pointer(vi->rq[i].xdp_prog, prog); if (i == 0)
virtnet_restore_guest_offloads(vi);
}
synchronize_net();
}
if (prog) {
vi->xdp_enabled = true; for (i = 0; i < vi->max_queue_pairs; i++) {
rcu_assign_pointer(vi->rq[i].xdp_prog, prog); if (i == 0 && !old_prog)
virtnet_clear_guest_offloads(vi);
} if (!old_prog)
xdp_features_set_redirect_target(dev, true);
} else {
xdp_features_clear_redirect_target(dev);
vi->xdp_enabled = false;
}
virtnet_rx_resume_all(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (old_prog)
bpf_prog_put(old_prog); if (netif_running(dev))
virtnet_napi_tx_enable(&vi->sq[i]);
}
return 0;
err: if (!prog) {
virtnet_clear_guest_offloads(vi); for (i = 0; i < vi->max_queue_pairs; i++)
rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
}
virtnet_rx_resume_all(vi); if (netif_running(dev)) { for (i = 0; i < vi->max_queue_pairs; i++)
virtnet_napi_tx_enable(&vi->sq[i]);
} if (prog)
bpf_prog_sub(prog, vi->max_queue_pairs - 1); return err;
}
/* free_receive_page_frags - drop the alloc_frag page held by each receive queue.
 * @vi: virtio net info
 *
 * Visits all max_queue_pairs receive queues. Queues without an alloc_frag
 * page are skipped. For the others, if last_dma is set it is handed to
 * virtnet_rq_unmap() first, then the page reference is released with
 * put_page().
 */
static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (!vi->rq[i].alloc_frag.page)
			continue;

		/* Unmap before the page reference is dropped. */
		if (vi->rq[i].last_dma)
			virtnet_rq_unmap(&vi->rq[i], vi->rq[i].last_dma, 0);

		put_page(vi->rq[i].alloc_frag.page);
	}
}
/* virtnet_sq_free_unused_buf - release one unused buffer taken from a send vq.
 * @vq: the send virtqueue; its tx queue index selects the send_queue
 * @buf: the buffer token taken from @vq
 *
 * virtnet_xmit_ptr_unpack() is given &buf and returns the xmit type; the
 * value it leaves in buf is what the type-specific release below receives.
 */
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
{
	struct virtnet_info *vi = vq->vdev->priv;
	struct send_queue *sq = &vi->sq[vq2txq(vq)];

	switch (virtnet_xmit_ptr_unpack(&buf)) {
	case VIRTNET_XMIT_TYPE_SKB:
	case VIRTNET_XMIT_TYPE_SKB_ORPHAN:
		dev_kfree_skb(buf);
		break;

	case VIRTNET_XMIT_TYPE_XDP:
		xdp_return_frame(buf);
		break;

	case VIRTNET_XMIT_TYPE_XSK:
		/* buf is not freed here; one completion is reported to the pool. */
		xsk_tx_completed(sq->xsk_pool, 1);
		break;
	}
}
staticvoid virtnet_sq_free_unused_buf_done(struct virtqueue *vq)
{ struct virtnet_info *vi = vq->vdev->priv; int i = vq2txq(vq);
/* How large should a single buffer be so a queue full of these can fit at * least one full packet? * Logic below assumes the mergeable buffer header is used.
*/ staticunsignedint mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{ constunsignedint hdr_len = vi->hdr_len; unsignedint rq_size = virtqueue_get_vring_size(vq); unsignedint packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu; unsignedint buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len; unsignedint min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
staticint virtnet_find_vqs(struct virtnet_info *vi)
{ struct virtqueue_info *vqs_info; struct virtqueue **vqs; int ret = -ENOMEM; int total_vqs; bool *ctx;
u16 i;
/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by * possible control vq.
*/
total_vqs = vi->max_queue_pairs * 2 +
virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
/* Allocate space for find_vqs parameters */
vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL); if (!vqs) goto err_vq;
vqs_info = kcalloc(total_vqs, sizeof(*vqs_info), GFP_KERNEL); if (!vqs_info) goto err_vqs_info; if (!vi->big_packets || vi->mergeable_rx_bufs) {
ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL); if (!ctx) goto err_ctx;
} else {
ctx = NULL;
}
/* Parameters for control virtqueue, if any */ if (vi->has_cvq) {
vqs_info[total_vqs - 1].name = "control";
}
/* Allocate/initialize parameters for send/receive virtqueues */ for (i = 0; i < vi->max_queue_pairs; i++) {
vqs_info[rxq2vq(i)].callback = skb_recv_done;
vqs_info[txq2vq(i)].callback = skb_xmit_done;
sprintf(vi->rq[i].name, "input.%u", i);
sprintf(vi->sq[i].name, "output.%u", i);
vqs_info[rxq2vq(i)].name = vi->rq[i].name;
vqs_info[txq2vq(i)].name = vi->sq[i].name; if (ctx)
vqs_info[rxq2vq(i)].ctx = true;
}
ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, vqs_info, NULL); if (ret) goto err_find;
if (vi->has_cvq) {
vi->cvq = vqs[total_vqs - 1]; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
}
for (i = 0; i < vi->max_queue_pairs; i++) {
vi->rq[i].vq = vqs[rxq2vq(i)];
vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
vi->sq[i].vq = vqs[txq2vq(i)];
}
if (!virtnet_validate_features(vdev)) return -EINVAL;
if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { int mtu = virtio_cread16(vdev,
offsetof(struct virtio_net_config,
mtu)); if (mtu < MIN_MTU)
__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
}
if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY) &&
!virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
dev_warn(&vdev->dev, "device advertises feature VIRTIO_NET_F_STANDBY but not VIRTIO_NET_F_MAC, disabling standby");
__virtio_clear_bit(vdev, VIRTIO_NET_F_STANDBY);
}
/* If device can receive ANY guest GSO packets, regardless of mtu, * allocate packets of maximum size, otherwise limit it to only * mtu size worth only.
*/ if (mtu > ETH_DATA_LEN || guest_gso) {
vi->big_packets = true;
vi->big_packets_num_skbfrags = guest_gso ? MAX_SKB_FRAGS : DIV_ROUND_UP(mtu, PAGE_SIZE);
}
}
/* We need at least 2 queue's */ if (max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
max_queue_pairs = 1;
/* Allocate ourselves a network device with room for our info */
dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs); if (!dev) return -ENOMEM;
/* Set up network device as normal. */
dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
IFF_TX_SKB_NO_LINEAR;
dev->netdev_ops = &virtnet_netdev;
dev->stat_ops = &virtnet_stat_ops;
dev->features = NETIF_F_HIGHDMA;
/* Do we support "hardware" checksums? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */
dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (csum)
dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
dev->hw_features |= NETIF_F_TSO
| NETIF_F_TSO_ECN | NETIF_F_TSO6;
} /* Individual feature bits: what can host handle? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
dev->hw_features |= NETIF_F_TSO; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
dev->hw_features |= NETIF_F_TSO6; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
dev->hw_features |= NETIF_F_TSO_ECN; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_USO))
dev->hw_features |= NETIF_F_GSO_UDP_L4;
if (gso)
dev->features |= dev->hw_features; /* (!csum && gso) case will be fixed by register_netdev() */
}
/* 1. With VIRTIO_NET_F_GUEST_CSUM negotiation, the driver doesn't * need to calculate checksums for partially checksummed packets, * as they're considered valid by the upper layer. * 2. Without VIRTIO_NET_F_GUEST_CSUM negotiation, the driver only * receives fully checksummed packets. The device may assist in * validating these packets' checksums, so the driver won't have to.
*/
dev->features |= NETIF_F_RXCSUM;
if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
dev->features |= NETIF_F_GRO_HW; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
dev->hw_features |= NETIF_F_GRO_HW;
if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM))
vi->rx_tnl_csum = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO))
vi->rx_tnl = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO))
vi->tx_tnl = true;
if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
vi->any_header_sg = true;
if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
vi->has_cvq = true;
mutex_init(&vi->cvq_lock);
if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
mtu = virtio_cread16(vdev,
offsetof(struct virtio_net_config,
mtu)); if (mtu < dev->min_mtu) { /* Should never trigger: MTU was previously validated * in virtnet_validate.
*/
dev_err(&vdev->dev, "device MTU appears to have changed it is now %d < %d",
mtu, dev->min_mtu);
err = -EINVAL; goto free;
}
dev->mtu = mtu;
dev->max_mtu = mtu;
}
virtnet_set_big_packets(vi, mtu);
if (vi->any_header_sg)
dev->needed_headroom = vi->hdr_len;
/* Enable multiqueue by default */ if (num_online_cpus() >= max_queue_pairs)
vi->curr_queue_pairs = max_queue_pairs; else
vi->curr_queue_pairs = num_online_cpus();
vi->max_queue_pairs = max_queue_pairs;
/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
err = init_vqs(vi); if (err) goto free;
/* Keep the default values of the coalescing parameters * aligned with the default napi_tx state.
*/ if (vi->sq[0].napi.weight)
vi->intr_coal_tx.max_packets = 1; else
vi->intr_coal_tx.max_packets = 0;
}
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) { /* The reason is the same as VIRTIO_NET_F_NOTF_COAL. */ for (i = 0; i < vi->max_queue_pairs; i++) if (vi->sq[i].napi.weight)
vi->sq[i].intr_coal.max_packets = 1;
err = virtnet_init_irq_moder(vi); if (err) goto free;
}
/* Disable config change notification until ndo_open. */
virtio_config_driver_disable(vi->vdev);
virtio_device_ready(vdev);
if (vi->has_rss || vi->has_rss_hash_report) { if (!virtnet_commit_rss_command(vi)) {
dev_warn(&vdev->dev, "RSS disabled because committing failed.\n");
dev->hw_features &= ~NETIF_F_RXHASH;
vi->has_rss_hash_report = false;
vi->has_rss = false;
}
}
virtnet_set_queues(vi, vi->curr_queue_pairs);
/* a random MAC address has been assigned, notify the device. * We don't fail probe if VIRTIO_NET_F_CTRL_MAC_ADDR is not there * because many devices work fine without getting MAC explicitly
*/ if (!virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { struct scatterlist sg;
if (!virtnet_send_command_reply(vi, VIRTIO_NET_CTRL_STATS,
VIRTIO_NET_CTRL_STATS_QUERY,
NULL, &sg)) {
pr_debug("virtio_net: fail to get stats capability\n");
rtnl_unlock();
err = -EINVAL; goto free_unregister_netdev;
}
v = stats_cap->supported_stats_types[0];
vi->device_stats_cap = le64_to_cpu(v);
}
/* Assume link up if device can't report link status,
otherwise get link status from config. */
netif_carrier_off(dev); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
virtio_config_changed(vi->vdev);
} else {
vi->status = VIRTIO_NET_S_LINK_UP;
virtnet_update_settings(vi);
netif_carrier_on(dev);
}
for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) { unsignedint fbit;
staticvoid remove_vq_common(struct virtnet_info *vi)
{ int i;
virtio_reset_device(vi->vdev);
/* Free unused buffers in both send and recv, if any. */
free_unused_bufs(vi);
/* * Rule of thumb is netdev_tx_reset_queue() should follow any * skb freeing not followed by netdev_tx_completed_queue()
*/ for (i = 0; i < vi->max_queue_pairs; i++)
netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i));
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.186Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-28)
¤