// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * virtio-net server in host kernel.
 */
/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256

/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
/* NOTE: on the garbled original line, VHOST_GOODCOPY_LEN was fused onto the
 * VHOST_MAX_PEND directive and therefore never defined; each macro must be
 * on its own line since a #define extends to end of line.
 */
#define VHOST_GOODCOPY_LEN 256

/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)
/* Per-virtqueue reference tracker for outstanding zerocopy ubufs. */
struct vhost_net_ubuf_ref {
	/* refcount follows semantics similar to kref:
	 *  0: object is released
	 *  1: no outstanding ubufs
	 * >1: outstanding ubufs
	 */
	atomic_t refcount;
	/* Waiters blocked until all outstanding ubufs complete */
	wait_queue_head_t wait;
	/* Back-pointer to the owning virtqueue */
	struct vhost_virtqueue *vq;
	/* Deferred free via RCU */
	struct rcu_head rcu;
};
/* Number of buffers processed per batch (see the done_idx flush threshold
 * and the rx queue array sizing elsewhere in this file).
 */
#define VHOST_NET_BATCH 64

/* Simple consume-cursor view over an array of buffer pointers:
 * entries in [head, tail) are still pending.
 */
struct vhost_net_buf {
	void **queue;
	int tail;
	int head;
};
/* vhost-net private state wrapped around a generic vhost virtqueue. */
struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;
	size_t vhost_hlen;
	size_t sock_hlen;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* For TX, first used idx for DMA done zerocopy buffers
	 * For RX, number of batched heads
	 */
	int done_idx;
	/* Number of XDP frames batched */
	int batched_xdp;
	/* an array of userspace buffers info */
	struct ubuf_info_msgzc *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex.
	 */
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
	/* Batched XDP buffs */
	struct xdp_buff *xdp;
};
/* Top-level device state: one vhost_dev plus its TX/RX virtqueues. */
struct vhost_net {
	struct vhost_dev dev;
	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Number of TX recently submitted.
	 * Protected by tx vq lock.
	 */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock.
	 */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
	/* Private page frag cache */
	struct page_frag_cache pf_cache;
};
/* In case of DMA done not in order in lower device driver for some reason.
 * upend_idx is used to track end of used idx, done_idx is used to track head
 * of used idx. Once lower device DMA done contiguously, we will signal KVM
 * guest used idx.
 */
static void vhost_zerocopy_signal_used(struct vhost_net *net,
				       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	int i, add;
	int j = 0;

	/* Walk the outstanding window [done_idx, upend_idx) and stop at the
	 * first entry whose DMA has not completed yet; completed entries are
	 * cleared and counted in j.
	 */
	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			++j;
		} else
			break;
	}
	/* Signal the contiguous run of completed buffers; heads[] is used as
	 * a ring, so signal at most up to the UIO_MAXIOV wrap point per call.
	 */
	while (j) {
		add = min(UIO_MAXIOV - nvq->done_idx, j);
		vhost_add_used_and_signal_n(vq->dev, vq,
					    &vq->heads[nvq->done_idx],
					    NULL, add);
		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
		j -= add;
	}
}
/* set len to mark this desc buffers done DMA */
vq->heads[ubuf->desc].len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);
/* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1. * We also trigger polling periodically after each 16 packets * (the value 16 here is more or less arbitrary, it's tuned to trigger * less than 10% of times).
*/ if (cnt <= 1 || !(cnt % 16))
vhost_poll_queue(&vq->poll);
/* free pages owned by XDP; since this is an unlikely error path, * keep it simple and avoid more complex bulk update for the * used pages
*/ for (i = 0; i < nvq->batched_xdp; ++i)
put_page(virt_to_head_page(nvq->xdp[i].data));
nvq->batched_xdp = 0;
nvq->done_idx = 0; return;
}
/* Try to hold the vq mutex of the paired virtqueue. We can't * use mutex_lock() here since we could not guarantee a * consistenet lock ordering.
*/ if (!mutex_trylock(&vq->mutex)) return;
while (vhost_can_busy_poll(endtime)) { if (vhost_vq_has_work(vq)) {
*busyloop_intr = true; break;
}
if ((sock_has_rx_data(sock) &&
!vhost_vq_avail_empty(&net->dev, rvq)) ||
!vhost_vq_avail_empty(&net->dev, tvq)) break;
cpu_relax();
}
preempt_enable();
if (poll_rx || sock_has_rx_data(sock))
vhost_net_busy_poll_try_queue(net, vq); elseif (!poll_rx) /* On tx here, sock has no rx data. */
vhost_enable_notify(&net->dev, rvq);
if (nvq->done_idx == VHOST_NET_BATCH)
vhost_tx_batch(net, nvq, sock, &msg);
head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
&busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { /* Flush batched packets to handle pending RX * work (if busyloop_intr is set) and to avoid * unnecessary virtqueue kicks.
*/
vhost_tx_batch(net, nvq, sock, &msg); if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} elseif (unlikely(vhost_enable_notify(&net->dev,
vq))) {
vhost_disable_notify(&net->dev, vq); continue;
} break;
}
total_len += len;
/* For simplicity, TX batching is only enabled if * sndbuf is unlimited.
*/ if (sock_can_batch) {
err = vhost_net_build_xdp(nvq, &msg.msg_iter); if (!err) { goto done;
} elseif (unlikely(err != -ENOSPC)) {
vhost_tx_batch(net, nvq, sock, &msg);
vhost_discard_vq_desc(vq, 1);
vhost_net_enable_vq(net, vq); break;
}
if (nvq->batched_xdp) { /* We can't build XDP buff, go for single * packet path but let's flush batched * packets.
*/
vhost_tx_batch(net, nvq, sock, &msg);
}
msg.msg_control = NULL;
} else { if (tx_can_batch(vq, total_len))
msg.msg_flags |= MSG_MORE; else
msg.msg_flags &= ~MSG_MORE;
}
/* Release DMAs done buffers first */
vhost_zerocopy_signal_used(net, vq);
busyloop_intr = false;
head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
&busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} elseif (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq); continue;
} break;
}
zcopy_used = len >= VHOST_GOODCOPY_LEN
&& !vhost_exceeds_maxpend(net)
&& vhost_net_tx_select_zcopy(net);
if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS)
vhost_net_ubuf_put(ubufs); if (retry)
nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
% UIO_MAXIOV; else
vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
} if (retry) {
vhost_discard_vq_desc(vq, 1);
vhost_net_enable_vq(net, vq); break;
}
pr_debug("Fail to send packet: err %d", err);
} elseif (unlikely(err != len))
pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used)
vhost_add_used_and_signal(&net->dev, vq, head, 0); else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
}
/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	struct socket *sock;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
	sock = vhost_vq_get_backend(vq);
	if (!sock)
		goto out;

	/* Dispatch to the zerocopy path only when the backend supports it. */
	if (vhost_sock_zcopy(sock))
		handle_tx_zerocopy(net, sock);
	else
		handle_tx_copy(net, sock);

out:
	mutex_unlock(&vq->mutex);
}
/* Peek at the length of the next pending RX packet without dequeueing it.
 * Returns 0 when nothing is queued.
 */
static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	/* Tap backends expose a ptr_ring; peek that instead of the socket
	 * receive queue.
	 */
	if (rvq->rx_ring)
		return vhost_net_buf_peek(rvq);

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}
	/* Must balance the irqsave above on every path; the garbled original
	 * dropped this unlock, which would deadlock the receive queue.
	 */
	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);

	return len;
}
/* This is a multi-buffer version of vhost_get_desc, that works if
 *	vq has read descriptors only.
 * @nvq		- the relevant vhost_net virtqueue
 * @heads	- used-elem array to fill with the consumed descriptor heads
 * @nheads	- batched head count output (used only with VIRTIO_F_IN_ORDER)
 * @datalen	- data length we'll be reading
 * @iovcount	- returned count of io vectors we fill
 * @log		- vhost log
 * @log_num	- log offset
 * @quota	- headcount quota, 1 for big buffer
 *	returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
		       struct vring_used_elem *heads,
		       u16 *nheads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 len;

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		/* RX buffers must be write-only (device writes into them). */
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
				"out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		len = iov_length(vq->iov + seg, in);
		if (!in_order) {
			heads[headcount].id = cpu_to_vhost32(vq, d);
			heads[headcount].len = cpu_to_vhost32(vq, len);
		}
		++headcount;
		datalen -= len;
		seg += in;
	}

	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun: the caller treats headcount > UIO_MAXIOV as
	 * "truncate and discard", so report it through the err path.
	 */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}

	/* Trim the last head to the actual data consumed (datalen <= 0). */
	if (!in_order)
		heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	else {
		heads[0].len = cpu_to_vhost32(vq, len + datalen);
		heads[0].id = cpu_to_vhost32(vq, d);
		nheads[0] = headcount;
	}

	return headcount;
err:
	/* Give back any descriptors we consumed before the failure. */
	vhost_discard_vq_desc(vq, headcount);
	return r;
}
do {
sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
&busyloop_intr, &count); if (!sock_len) break;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(nvq, vq->heads + count,
vq->nheads + count,
vhost_len, &in, vq_log, &log,
likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} elseif (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were
* doing that: check again. */
vhost_disable_notify(&net->dev, vq); continue;
} /* Nothing new? Wait for eventfd to tell us
* they refilled. */ goto out;
}
busyloop_intr = false; if (nvq->rx_ring)
msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) {
iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1);
err = sock->ops->recvmsg(sock, &msg,
1, MSG_DONTWAIT | MSG_TRUNC);
pr_debug("Discarded rx packet: len %zd\n", sock_len); continue;
} /* We don't need to be notified again. */
iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len);
fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO.
*/
iov_iter_advance(&msg.msg_iter, vhost_hlen);
}
err = sock->ops->recvmsg(sock, &msg,
sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard
* to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len);
vhost_discard_vq_desc(vq, headcount); continue;
} /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr),
&fixup) != sizeof(hdr)) {
vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); goto out;
}
} else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
*/
iov_iter_advance(&fixup, sizeof(hdr));
} /* TODO: Should check and handle checksum. */
vhost_net_stop(n, &tx_sock, &rx_sock);
vhost_net_flush(n);
vhost_dev_stop(&n->dev);
vhost_dev_cleanup(&n->dev);
vhost_net_vq_reset(n); if (tx_sock)
sockfd_put(tx_sock); if (rx_sock)
sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */
synchronize_rcu(); /* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
kfree(n->dev.vqs);
page_frag_cache_drain(&n->pf_cache);
kvfree(n); return 0;
}
/* special case to disable backend */ if (fd == -1) return NULL;
sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock;
sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK);
}
mutex_lock(&n->dev.mutex);
r = vhost_dev_check_owner(&n->dev); if (r) goto err;
if (index >= VHOST_NET_VQ_MAX) {
r = -ENOBUFS; goto err;
}
vq = &n->vqs[index].vq;
nvq = &n->vqs[index];
mutex_lock(&vq->mutex);
if (fd == -1)
vhost_clear_msg(&n->dev);
/* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) {
r = -EFAULT; goto err_vq;
}
sock = get_socket(fd); if (IS_ERR(sock)) {
r = PTR_ERR(sock); goto err_vq;
}
/* start polling new socket */
oldsock = vhost_vq_get_backend(vq); if (sock != oldsock) {
ubufs = vhost_net_ubuf_alloc(vq,
sock && vhost_sock_zcopy(sock)); if (IS_ERR(ubufs)) {
r = PTR_ERR(ubufs); goto err_ubufs;
}
vhost_net_disable_vq(n, vq);
vhost_vq_set_backend(vq, sock);
vhost_net_buf_unproduce(nvq);
r = vhost_vq_init_access(vq); if (r) goto err_used;
r = vhost_net_enable_vq(n, vq); if (r) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock)
nvq->rx_ring = get_tap_ptr_ring(sock->file); else
nvq->rx_ring = NULL;
}
/* Bind the calling process as the owner of this vhost-net device.
 * Returns 0 on success, -EBUSY if the device already has an owner, or a
 * negative errno from ubuf-info setup / vhost_dev_set_owner().
 */
static long vhost_net_set_owner(struct vhost_net *n)
{
	int r;

	mutex_lock(&n->dev.mutex);
	if (vhost_dev_has_owner(&n->dev)) {
		r = -EBUSY;
		goto out;
	}
	r = vhost_net_set_ubuf_info(n);
	if (r)
		goto out;
	r = vhost_dev_set_owner(&n->dev);
	if (r)
		/* Undo the ubuf-info allocation on failure. */
		vhost_net_clear_ubuf_info(n);
	vhost_net_flush(n);
out:
	mutex_unlock(&n->dev.mutex);
	return r;
}
switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES:
features = vhost_net_features[0]; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~vhost_net_features[0]) return -EOPNOTSUPP;
virtio_features_from_u64(all_features, features); return vhost_net_set_features(n, all_features); case VHOST_GET_FEATURES_ARRAY: if (copy_from_user(&count, featurep, sizeof(count))) return -EFAULT;
/* Copy the net features, up to the user-provided buffer size */
argp += sizeof(u64);
copied = min(count, VIRTIO_FEATURES_DWORDS); if (copy_to_user(argp, vhost_net_features,
copied * sizeof(u64))) return -EFAULT;
/* Zero the trailing space provided by user-space, if any */ if (clear_user(argp, size_mul(count - copied, sizeof(u64)))) return -EFAULT; return 0; case VHOST_SET_FEATURES_ARRAY: if (copy_from_user(&count, featurep, sizeof(count))) return -EFAULT;
/* * Any feature specified by user-space above * VIRTIO_FEATURES_MAX is not supported by definition.
*/ for (i = copied; i < count; ++i) { if (copy_from_user(&features, featurep + 1 + i, sizeof(features))) return -EFAULT; if (features) return -EOPNOTSUPP;
}
for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++) if (all_features[i] & ~vhost_net_features[i]) return -EOPNOTSUPP;
return vhost_net_set_features(n, all_features); case VHOST_GET_BACKEND_FEATURES:
features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~VHOST_NET_BACKEND_FEATURES) return -EOPNOTSUPP;
vhost_set_backend_features(&n->dev, features); return 0; case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); case VHOST_SET_OWNER: return vhost_net_set_owner(n); default:
mutex_lock(&n->dev.mutex);
r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD)
r = vhost_vring_ioctl(&n->dev, ioctl, argp); else
vhost_net_flush(n);
mutex_unlock(&n->dev.mutex); return r;
}
}
/*
 * NOTE(review): the German text below is website-disclaimer residue from the
 * page this file was extracted from; it is not part of the source.  It is
 * preserved here, commented out, so the file remains compilable:
 *
 * "Die Informationen auf dieser Webseite wurden nach bestem Wissen
 *  sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch
 *  Richtigkeit, noch Qualität der bereit gestellten Informationen
 *  zugesichert.
 *  Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 *  experimentell."
 */