/*
 * RCU usage:
 * The tap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the macvlan_dev hold a reference on the tap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * tap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
*/ staticvoid tap_put_queue(struct tap_queue *q)
{ struct tap_dev *tap;
rtnl_lock();
tap = rtnl_dereference(q->tap);
if (tap) { if (q->enabled)
BUG_ON(tap_disable_queue(q));
/* * Select a queue based on the rxq of the device on which this packet * arrived. If the incoming device is not mq, calculate a flow hash * to select a queue. If all fails, find the first available queue. * Cache vlan->numvtaps since it can become zero during the execution * of this function.
*/ staticstruct tap_queue *tap_get_queue(struct tap_dev *tap, struct sk_buff *skb)
{ struct tap_queue *queue = NULL; /* Access to taps array is protected by rcu, but access to numvtaps * isn't. Below we use it to lookup a queue, but treat it as a hint * and validate that the result isn't NULL - in case we are * racing against queue removal.
*/ int numvtaps = READ_ONCE(tap->numvtaps);
__u32 rxq;
if (!numvtaps) goto out;
if (numvtaps == 1) goto single;
/* Check if we can use flow to select a queue */
rxq = skb_get_hash(skb); if (rxq) {
queue = rcu_dereference(tap->taps[rxq % numvtaps]); goto out;
}
if (likely(skb_rx_queue_recorded(skb))) {
rxq = skb_get_rx_queue(skb);
while (unlikely(rxq >= numvtaps))
rxq -= numvtaps;
/*
 * The net_device is being torn down: walk every queue on the device,
 * unlink it, clear its pointer back to the device and release the
 * reference the device held on it.
 *
 * Must be called with rtnl held (checked by ASSERT_RTNL()).
 */
void tap_del_queues(struct tap_dev *tap)
{
	struct tap_queue *queue;
	struct tap_queue *following;

	ASSERT_RTNL();

	list_for_each_entry_safe(queue, following, &tap->queue_list, next) {
		/* Unlink the queue, then sever its queue -> device pointer. */
		list_del_init(&queue->next);
		RCU_INIT_POINTER(queue->tap, NULL);

		/* numvtaps is only decremented for queues that are enabled. */
		if (queue->enabled)
			tap->numvtaps--;
		tap->numqueues--;

		/* Give up the reference held on the queue's socket. */
		sock_put(&queue->sk);
	}

	/* Both counters must be back at zero once the list is drained. */
	BUG_ON(tap->numvtaps);
	BUG_ON(tap->numqueues);

	/* guarantee that any future tap_set_queue will fail */
	tap->numvtaps = MAX_TAP_QUEUES;
}
EXPORT_SYMBOL_GPL(tap_del_queues);
tap = tap_dev_get_rcu(dev); if (!tap) return RX_HANDLER_PASS;
q = tap_get_queue(tap, skb); if (!q) return RX_HANDLER_PASS;
skb_push(skb, ETH_HLEN);
/* Apply the forward feature mask so that we perform segmentation * according to users wishes. This only works if VNET_HDR is * enabled.
*/ if (q->flags & IFF_VNET_HDR)
features |= tap->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next;
if (IS_ERR(segs)) {
drop_reason = SKB_DROP_REASON_SKB_GSO_SEG; goto drop;
}
if (!segs) { if (ptr_ring_produce(&q->ring, skb)) {
drop_reason = SKB_DROP_REASON_FULL_RING; goto drop;
} goto wake_up;
}
consume_skb(skb);
skb_list_walk_safe(segs, skb, next) {
skb_mark_not_on_list(skb); if (ptr_ring_produce(&q->ring, skb)) {
drop_reason = SKB_DROP_REASON_FULL_RING;
kfree_skb_reason(skb, drop_reason);
kfree_skb_list_reason(next, drop_reason); break;
}
}
} else { /* If we receive a partial checksum and the tap side * doesn't support checksum offload, compute the checksum. * Note: it doesn't matter which checksum feature to * check, we either support them all or none.
*/ if (skb->ip_summed == CHECKSUM_PARTIAL &&
!(features & NETIF_F_CSUM_MASK) &&
skb_checksum_help(skb)) {
drop_reason = SKB_DROP_REASON_SKB_CSUM; goto drop;
} if (ptr_ring_produce(&q->ring, skb)) {
drop_reason = SKB_DROP_REASON_FULL_RING; goto drop;
}
}
rcu_read_lock();
tap_major = tap_get_major(major); if (!tap_major) {
tap = NULL; goto unlock;
}
spin_lock(&tap_major->minor_lock);
tap = idr_find(&tap_major->minor_idr, minor); if (tap) {
dev = tap->dev;
dev_hold(dev);
}
spin_unlock(&tap_major->minor_lock);
/* * so far only KVM virtio_net uses tap, enable zero copy between * guest kernel and host kernel when lower device supports zerocopy * * The macvlan supports zerocopy iff the lower device supports zero * copy so we don't have to look at the lower device directly.
*/ if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
sock_set_flag(&q->sk, SOCK_ZEROCOPY);
err = tap_set_queue(tap, file, q); if (err) { /* tap_sock_destruct() will take care of freeing ptr_ring */ goto err_put;
}
/* tap groks IOCB_NOWAIT just fine, mark it as such */
file->f_mode |= FMODE_NOWAIT;
dev_put(tap->dev);
rtnl_unlock(); return err;
err_put:
sock_put(&q->sk);
err: if (tap)
dev_put(tap->dev);
rcu_read_lock();
tap = rcu_dereference(q->tap); if (!tap) {
kfree_skb(skb);
rcu_read_unlock(); return total_len;
}
skb->dev = tap->dev;
if (vnet_hdr_len) {
err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr); if (err) {
rcu_read_unlock();
drop_reason = SKB_DROP_REASON_DEV_HDR; goto err_kfree;
}
}
skb_probe_transport_header(skb);
/* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) &&
vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
skb_set_network_header(skb, depth);
/* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) {
skb_zcopy_init(skb, msg_control);
} elseif (msg_control) { struct ubuf_info *uarg = msg_control;
uarg->ops->complete(NULL, uarg, false);
}
/*
 * tap_put_user - copy one frame from @skb into the iterator @iter.
 *
 * @q:    queue whose flags and vnet_hdr_sz control the optional header
 * @skb:  frame to copy out (not modified here)
 * @iter: destination iterator
 *
 * When IFF_VNET_HDR is set in q->flags, a struct virtio_net_hdr is filled
 * in by tun_vnet_hdr_from_skb() and written with tun_vnet_hdr_put() before
 * the frame data; a non-zero result from either helper is returned as is.
 *
 * At the done: label the function returns ret when it is non-zero and the
 * accumulated length in total otherwise.
 *
 * NOTE(review): in this copy of the file `veth` is used without a visible
 * declaration, and the lone closing brace after the copy_to_iter() call
 * has no visible opening statement. The condition guarding the VLAN
 * handling appears to be missing here - confirm against the full source.
 * NOTE(review): `conststruct` in the parameter list looks like
 * `const struct` with the separating space lost - confirm.
 */
/* Put packet to the user space buffer */ static ssize_t tap_put_user(struct tap_queue *q, conststruct sk_buff *skb, struct iov_iter *iter)
{ int ret; int vnet_hdr_len = 0; int vlan_offset = 0; int total;
if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr;
/* Header size configured on the queue; read once for a stable value. */
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); if (ret) return ret;
ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr); if (ret) return ret;
}
/* total = vnet header length (0 if none) + frame length. */
total = vnet_hdr_len;
total += skb->len;
/*
 * Copy the frame up to the h_vlan_proto offset, then the contents of
 * veth, accounting VLAN_HLEN extra bytes in total. Either step jumps
 * to done: on failure or when the iterator has no space left.
 */
vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
total += VLAN_HLEN;
ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done;
ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done;
}
/* Copy the rest of the frame, starting at vlan_offset. */
ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
skb->len - vlan_offset);
done: return ret ? ret : total;
}
/*
 * tap_do_read - hand one frame to the caller's iterator.
 *
 * @q:       queue to read from
 * @to:      destination iterator
 * @noblock: non-zero to return -EAGAIN instead of sleeping on an empty ring
 * @skb:     optional frame supplied by the caller; when non-NULL it is
 *           delivered directly and the ring is not consulted
 *
 * Returns 0 when @to has no room (any supplied @skb is freed), -EAGAIN
 * when @noblock is set and the ring is empty, -ERESTARTSYS when a signal
 * is pending while waiting, otherwise the result of tap_put_user().
 */
static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to,
			   int noblock, struct sk_buff *skb)
{
	DEFINE_WAIT(wait);
	ssize_t ret = 0;

	if (!iov_iter_count(to)) {
		kfree_skb(skb);
		return 0;
	}

	if (!skb) {
		for (;;) {
			/* Register on the wait queue before testing the ring. */
			if (!noblock)
				prepare_to_wait(sk_sleep(&q->sk), &wait,
						TASK_INTERRUPTIBLE);

			/* Read frames from the queue */
			skb = ptr_ring_consume(&q->ring);
			if (skb)
				break;

			if (noblock) {
				ret = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}

			/* Nothing to read, let's sleep */
			schedule();
		}

		if (!noblock)
			finish_wait(sk_sleep(&q->sk), &wait);
	}

	/* No frame obtained: report the status chosen in the loop. */
	if (!skb)
		return ret;

	ret = tap_put_user(q, skb, to);
	if (unlikely(ret < 0))
		kfree_skb(skb);		/* copy failed: free as a drop */
	else
		consume_skb(skb);	/* copy succeeded: normal release */

	return ret;
}
tap = rtnl_dereference(q->tap); if (!tap) return -ENOLINK;
features = tap->dev->features;
if (arg & TUN_F_CSUM) {
feature_mask = NETIF_F_HW_CSUM;
if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN)
feature_mask |= NETIF_F_TSO_ECN; if (arg & TUN_F_TSO4)
feature_mask |= NETIF_F_TSO; if (arg & TUN_F_TSO6)
feature_mask |= NETIF_F_TSO6;
}
/* TODO: for now USO4 and USO6 should work simultaneously */ if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
features |= NETIF_F_GSO_UDP_L4;
}
/* tun/tap driver inverts the usage for TSO offloads, where * setting the TSO bit means that the userspace wants to * accept TSO frames and turning it off means that user space * does not support TSO. * For tap, we have to invert it to mean the same thing. * When user space turns off TSO, we turn off GSO/LRO so that * user-space will not receive TSO frames.
*/ if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) ||
(feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
features |= RX_OFFLOADS; else
features &= ~RX_OFFLOADS;
/* tap_features are the same as features on tun/tap and * reflect user expectations.
*/
tap->tap_features = feature_mask; if (tap->update_features)
tap->update_features(tap, features);
switch (cmd) { case TUNSETIFF: /* ignore the name, just look at flags */ if (get_user(u, &ifr->ifr_flags)) return -EFAULT;
ret = 0; if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP))
ret = -EINVAL; else
q->flags = (q->flags & ~TAP_IFFEATURES) | u;
return ret;
case TUNGETIFF:
rtnl_lock();
tap = tap_get_tap_dev(q); if (!tap) {
rtnl_unlock(); return -ENOLINK;
}
ret = 0;
u = q->flags; if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
put_user(u, &ifr->ifr_flags))
ret = -EFAULT;
tap_put_tap_dev(tap);
rtnl_unlock(); return ret;
case TUNSETQUEUE: if (get_user(u, &ifr->ifr_flags)) return -EFAULT;
rtnl_lock();
ret = tap_ioctl_set_queue(file, u);
rtnl_unlock(); return ret;
case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up)) return -EFAULT; return 0;
case TUNSETSNDBUF: if (get_user(s, sp)) return -EFAULT; if (s <= 0) return -EINVAL;
q->sk.sk_sndbuf = s; return 0;
case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
TUN_F_TSO_ECN | TUN_F_UFO |
TUN_F_USO4 | TUN_F_USO6)) return -EINVAL;
rtnl_lock();
ret = set_offload(q, arg);
rtnl_unlock(); return ret;
case SIOCGIFHWADDR:
rtnl_lock();
tap = tap_get_tap_dev(q); if (!tap) {
rtnl_unlock(); return -ENOLINK;
}
ret = 0;
netif_get_mac_address((struct sockaddr *)&ss, dev_net(tap->dev),
tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
copy_to_user(&ifr->ifr_hwaddr, &ss, sizeof(ifr->ifr_hwaddr)))
ret = -EFAULT;
tap_put_tap_dev(tap);
rtnl_unlock(); return ret;
case SIOCSIFHWADDR: if (copy_from_user(&ss, &ifr->ifr_hwaddr, sizeof(ifr->ifr_hwaddr))) return -EFAULT;
rtnl_lock();
tap = tap_get_tap_dev(q); if (!tap) {
rtnl_unlock(); return -ENOLINK;
} if (tap->dev->addr_len > sizeof(ifr->ifr_hwaddr))
ret = -EINVAL; else
ret = dev_set_mac_address_user(tap->dev, &ss, NULL);
tap_put_tap_dev(tap);
rtnl_unlock(); return ret;
if (vnet_hdr_len) {
err = tun_vnet_hdr_to_skb(q->flags, skb, gso); if (err) goto err_kfree;
}
/* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) &&
vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
skb_set_network_header(skb, depth);
rcu_read_lock();
tap = rcu_dereference(q->tap); if (tap) {
skb->dev = tap->dev;
skb_probe_transport_header(skb);
dev_queue_xmit(skb);
} else {
kfree_skb(skb);
}
rcu_read_unlock();
return 0;
err_kfree:
kfree_skb(skb);
err:
rcu_read_lock();
tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped)
tap->count_tx_dropped(tap);
rcu_read_unlock(); return err;
}
/* Ops structure to mimic raw sockets with tun */ staticconststruct proto_ops tap_socket_ops = {
.sendmsg = tap_sendmsg,
.recvmsg = tap_recvmsg,
.peek_len = tap_peek_len,
};
/*
 * Return the socket embedded in the tap queue behind @file.
 *
 * Yields ERR_PTR(-EINVAL) when @file does not use tap_fops, and
 * ERR_PTR(-EBADFD) when its private_data holds no queue. The returned
 * object works like a packet socket and can be used for
 * sock_sendmsg/sock_recvmsg. The caller is responsible for holding a
 * reference to the file for as long as the socket is in use.
 */
struct socket *tap_get_socket(struct file *file)
{
	struct tap_queue *queue;

	if (file->f_op != &tap_fops)
		return ERR_PTR(-EINVAL);

	queue = file->private_data;
	return queue ? &queue->sock : ERR_PTR(-EBADFD);
}
EXPORT_SYMBOL_GPL(tap_get_socket);
if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL);
q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->ring;
}
EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
int tap_queue_resize(struct tap_dev *tap)
{ struct net_device *dev = tap->dev; struct tap_queue *q; struct ptr_ring **rings; int n = tap->numqueues; int ret, i = 0;
rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.