/*
 * Changes:
 *
 * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
 *    Add TUNSETLINK ioctl to set the link encapsulation
 *
 * Mark Smith <markzzzsmith@yahoo.com.au>
 *    Use eth_random_addr() for tap MAC address.
 *
 * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20
 *    Fixes in packet dropping, queue length setting and queue wakeup.
 *    Increased default tx queue length.
 *    Added ethtool API.
 *    Minor cleanups
 *
 * Daniel Podlejski <underley@underley.eu.org>
 *    Modifications for 2.3.99-pre5 kernel.
 */
#define FLT_EXACT_COUNT 8 struct tap_filter { unsignedint count; /* Number of addrs. Zero means disabled */
u32 mask[2]; /* Mask of the hashed addrs */ unsignedchar addr[FLT_EXACT_COUNT][ETH_ALEN];
};
/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
 * to max number of VCPUs in guest.
 * NOTE: each #define must sit on its own line; fusing two directives on
 * one line would silently fold the second into the first's replacement.
 */
#define MAX_TAP_QUEUES 256
#define MAX_TAP_FLOWS  4096

#define TUN_FLOW_EXPIRE (3 * HZ)
/* A tun_file connects an open character device to a tuntap netdevice. It
 * also contains all socket related structures (except sock_fprog and tap_filter)
 * to serve as one transmit queue for tuntap device. The sock_fprog and
 * tap_filter were kept in tun_struct since they were used for filtering for the
 * netdevice not for a specific queue (at least I didn't see the requirement for
 * this).
 *
 * RCU usage:
 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 * other can only be read while rcu_read_lock or rtnl_lock is held.
 */
struct tun_file {
	struct sock sk;
	struct socket socket;
	struct tun_struct __rcu *tun;
	struct fasync_struct *fasync;	/* only used for fasync */
	unsigned int flags;
	union {
		u16 queue_index;
		unsigned int ifindex;
	};
	struct napi_struct napi;
	bool napi_enabled;
	bool napi_frags_enabled;
	struct mutex napi_mutex;	/* Protects access to the above napi */
	struct list_head next;
	struct tun_struct *detached;
	struct ptr_ring tx_ring;
	struct xdp_rxq_info xdp_rxq;
};
/* Pairs a page pointer with an integer count.
 * NOTE(review): the exact semantics of 'count' depend on the users of this
 * struct, which are not visible in this chunk — confirm against callers.
 */
struct tun_page {
	struct page *page;
	int count;
};
/* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device.
*/ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsignedint numqueues; unsignedint flags;
kuid_t owner;
kgid_t group;
e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ if (READ_ONCE(e->queue_index) != queue_index)
WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies)
e->updated = jiffies;
sock_rps_record_flow_hash(e->rps_rxhash);
} else {
spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) &&
tun->flow_count < MAX_TAP_FLOWS)
tun_flow_create(tun, head, rxhash, queue_index);
if (!timer_pending(&tun->flow_gc_timer))
mod_timer(&tun->flow_gc_timer,
round_jiffies_up(jiffies + delay));
spin_unlock_bh(&tun->lock);
}
rcu_read_unlock();
}
/* Save the hash received in the stack receive path and update the
 * flow_hash table accordingly.
 */
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
{
	/* The stored hash usually already matches; skip the store in that
	 * common case to avoid dirtying the cache line needlessly.
	 */
	if (likely(e->rps_rxhash == hash))
		return;

	e->rps_rxhash = hash;
}
/* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here.
*/ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{ struct tun_flow_entry *e;
u32 txq, numqueues;
rcu_read_lock(); if (rcu_dereference(tun->steering_prog))
ret = tun_ebpf_select_queue(tun, skb); else
ret = tun_automq_select_queue(tun, skb);
rcu_read_unlock();
rtnl_lock();
tun = rtnl_dereference(tfile->tun);
dev = tun ? tun->dev : NULL;
__tun_detach(tfile, clean); if (dev)
netdev_state_change(dev);
rtnl_unlock();
if (clean)
sock_put(&tfile->sk);
}
staticvoid tun_detach_all(struct net_device *dev)
{ struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues;
/* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst
* case we'll accept a few undesired packets. */
filter->count = 0;
wmb();
/* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
nexact = n;
/* Remaining multicast addresses are hashed,
* unicast will leave the filter disabled. */
memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) {
err = 0; /* no filter */ goto free_addr;
}
addr_hash_set(filter->mask, addr[n].u);
}
/* For ALLMULTI just set the mask to all ones.
* This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI))
memset(filter->mask, ~0, sizeof(filter->mask));
/* Now enable the filter */
wmb();
filter->count = nexact;
/* Return the number of exact filters */
err = nexact;
free_addr:
kfree(addr); return err;
}
/* Returns: 0 - drop, !=0 - accept */ staticint run_filter(struct tap_filter *filter, conststruct sk_buff *skb)
{ /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
* at this point. */ struct ethhdr *eh = (struct ethhdr *) skb->data; int i;
/* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1;
/* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest);
return 0;
}
/* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept
*/ staticint check_filter(struct tap_filter *filter, conststruct sk_buff *skb)
{ if (!filter->count) return 1;
/* On a single-queue device with RPS active, select-queue is never called
 * for the skb, so extract the RPS hash here and record it in the flow
 * table. No-op unless CONFIG_RPS is enabled.
 */
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
	struct tun_flow_entry *flow;
	__u32 hash;

	if (tun->numqueues != 1 || !static_branch_unlikely(&rps_needed))
		return;

	hash = __skb_get_hash_symmetric(skb);
	flow = tun_flow_find(&tun->flows[tun_hashfn(hash)], hash);
	if (flow)
		tun_flow_save_rps_rxhash(flow, hash);
#endif
}
/* Drop if the filter does not like it. * This is a noop if the filter is disabled.
* Filter can be enabled only for the TAP devices. */ if (!check_filter(&tun->txflt, skb)) {
drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop;
}
if (tfile->socket.sk->sk_filter &&
sk_filter_reason(tfile->socket.sk, skb, &drop_reason)) goto drop;
len = run_ebpf_filter(tun, skb, len); if (len == 0) {
drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop;
}
if (pskb_trim(skb, len)) {
drop_reason = SKB_DROP_REASON_NOMEM; goto drop;
}
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop;
}
skb_tx_timestamp(skb);
/* Orphan the skb - required as we might hang on to it * for indefinite time.
*/
skb_orphan(skb);
nf_reset_ct(skb);
if (ptr_ring_produce(&tfile->tx_ring, skb)) {
drop_reason = SKB_DROP_REASON_FULL_RING; goto drop;
}
/* dev->lltx requires to do our own update of trans_start */
queue = netdev_get_tx_queue(dev, txq);
txq_trans_cond_update(queue);
/* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC)
kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
static void tun_net_mclist(struct net_device *dev)
{
	/*
	 * Multicast filtering belongs to the _rx_ path, not the _tx_ path
	 * this callback sits on, and on rx we accept everything userspace
	 * gives us anyway — so this is intentionally empty.
	 */
}
staticvoid __tun_xdp_flush_tfile(struct tun_file *tfile)
{ /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC)
kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
}
staticint tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags)
{ struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile;
u32 numqueues; int nxmit = 0; int i;
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL;
rcu_read_lock();
resample:
numqueues = READ_ONCE(tun->numqueues); if (!numqueues) {
rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */
}
tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
numqueues]); if (unlikely(!tfile)) goto resample;
spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff.
*/ void *frame = tun_xdp_to_ptr(xdp);
if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
dev_core_stats_tx_dropped_inc(dev); break;
}
nxmit++;
}
spin_unlock(&tfile->tx_ring.producer_lock);
if (flags & XDP_XMIT_FLUSH)
__tun_xdp_flush_tfile(tfile);
if (!ptr_ring_empty(&tfile->tx_ring))
mask |= EPOLLIN | EPOLLRDNORM;
/* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO.
*/ if (tun_sock_writeable(tun, tfile) ||
(!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
tun_sock_writeable(tun, tfile)))
mask |= EPOLLOUT | EPOLLWRNORM;
if (tun->dev->reg_state != NETREG_REGISTERED)
mask = EPOLLERR;
tun_put(tun); return mask;
}
staticstruct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
size_t len, conststruct iov_iter *it)
{ struct sk_buff *skb;
size_t linear; int err; int i;
if (it->nr_segs > MAX_SKB_FRAGS + 1 ||
len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE);
local_bh_disable();
skb = napi_get_frags(&tfile->napi);
local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM);
linear = iov_iter_single_seg_count(it);
err = __skb_grow(skb, linear); if (err) goto free;
return skb;
free: /* frees skb and all frags allocated with napi_alloc_frag() */
napi_free_frags(&tfile->napi); return ERR_PTR(err);
}
/* prepad is the amount to reserve at front. len is length after that.
* linear is a hint as to how much to copy (usually headers). */ staticstruct sk_buff *tun_alloc_skb(struct tun_file *tfile,
size_t prepad, size_t len,
size_t linear, int noblock)
{ struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err;
/* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE)
linear = len;
if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
&err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err);
skb_reserve(skb, prepad);
skb_put(skb, linear);
skb->data_len = len - linear;
skb->len += len - linear;
/* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough.
*/ if (hdr->gso_type || !xdp_prog) {
*skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
pad, metasize);
}
if (err == XDP_REDIRECT)
xdp_do_flush(); if (err != XDP_PASS) goto out;
pad = xdp.data - xdp.data_hard_start;
len = xdp.data_end - xdp.data;
/* It is known that the xdp_buff was prepared with metadata * support, so the metasize will never be negative.
*/
metasize = xdp.data - xdp.data_meta;
}
bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock();
local_bh_enable();
/* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more)
{ struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb;
size_t total_len = iov_iter_count(from);
size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; int good_linear; int copylen; int hdr_len = 0; bool zerocopy = false; int err;
u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
netdev_features_t features = 0;
/* * Keep it easy and always zero the whole buffer, even if the * tunnel-related field will be touched only when the feature * is enabled and the hdr size id compatible.
*/
memset(&hdr, 0, sizeof(hdr));
gso = (struct virtio_net_hdr *)&hdr;
if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL;
len -= sizeof(pi);
if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT;
}
if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
features = tun_vnet_hdr_guest_features(vnet_hdr_sz);
hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags,
features, from, gso); if (hdr_len < 0) return hdr_len;
/* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace.
*/
copylen = min(hdr_len ? hdr_len : GOODCOPY_LEN, good_linear);
linear = copylen;
iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
zerocopy = true;
}
if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine.
*/
skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp);
err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len;
} else { if (!zerocopy) {
copylen = len;
linear = min(hdr_len, good_linear);
}
if (frags) {
mutex_lock(&tfile->napi_mutex);
skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter().
*/
zerocopy = false;
} else { if (!linear)
linear = min_t(size_t, good_linear, copylen);
/* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize.
*/ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
!tfile->detached)
rxhash = __skb_get_hash_symmetric(skb);
/* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter)
{ struct tun_pi pi = { 0, skb->protocol };
ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; int ret;
if (skb_vlan_tag_present(skb))
vlan_hlen = VLAN_HLEN;
if (tun->flags & IFF_VNET_HDR)
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
total = skb->len + vlan_hlen + vnet_hdr_sz;
if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL;
total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */
pi.flags |= TUN_PKT_STRIP;
}
if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT;
}
if (vnet_hdr_sz) { struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso;
ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb,
&hdr); if (ret) return ret;
/* * Drop the packet if the configured header size is too small * WRT the enabled offloads.
*/
gso = (struct virtio_net_hdr *)&hdr;
ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features,
iter, gso); if (ret) return ret;
}
/* Trivial set of netlink ops to allow deleting tun or tap
 * device with netlink: creation through rtnetlink is rejected here,
 * since tun/tap devices are created via the character device instead.
 */
static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported.");
	return -EOPNOTSUPP;
}
/* The externally provided xdp_buff may have no metadata support, which * is marked by xdp->data_meta being xdp->data + 1. This will lead to a * metasize of -1 and is the reason why the condition checks for > 0.
*/
metasize = xdp->data - xdp->data_meta; if (metasize > 0)
skb_metadata_set(skb, metasize);
features = tun_vnet_hdr_guest_features(READ_ONCE(tun->vnet_hdr_sz));
tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso; if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) {
atomic_long_inc(&tun->rx_frame_errors);
kfree_skb(skb);
ret = -EINVAL; goto out;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.