// SPDX-License-Identifier: GPL-2.0-or-later
/* vrf.c: device driver to encapsulate a VRF space
 *
 * Copyright (c) 2015 Cumulus Networks. All rights reserved.
 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
 *
 * Based on dummy, team and ipvlan drivers
*/
/* shared_tables:
 * count how many distinct tables do not comply with the strict mode
 * requirement.
 * shared_tables value must be 0 in order to enable the strict mode.
 *
 * example of the evolution of shared_tables:
 *
 *                                                        | time
 * add  vrf0 --> table 100        shared_tables = 0       | t0
 * add  vrf1 --> table 101        shared_tables = 0       | t1
 * add  vrf2 --> table 100        shared_tables = 1       | t2
 * add  vrf3 --> table 100        shared_tables = 1       | t3
 * add  vrf4 --> table 101        shared_tables = 2       v t4
 *
 * shared_tables is a "step function" (or "staircase function")
 * and it is increased by one when the second vrf is associated to a
 * table.
 *
 * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1.
 *
 * at t3, another dev (vrf3) is bound to the same table 100 but the
 * value of shared_tables is still 1.
 * This means that no matter how many new vrfs will register on the
 * table 100, the shared_tables will not increase (considering only
 * table 100).
 *
 * at t4, vrf4 is bound to table 101, and shared_tables = 2.
 *
 * Looking at the value of shared_tables we can immediately know if
 * the strict_mode can or cannot be enforced. Indeed, strict_mode
 * can be enforced iff shared_tables = 0.
 *
 * Conversely, shared_tables is decreased when a vrf is de-associated
 * from a table with exactly two associated vrfs.
*/
u32 shared_tables;
bool strict_mode;
};
/* One entry per FIB table that has at least one VRF bound to it;
 * hashed into the per-netns vrf_map by table_id.
 */
struct vrf_map_elem {
	struct hlist_node hnode;
	struct list_head vrf_list; /* VRFs registered to this table */

	u32 table_id;
	int users;	/* number of VRFs bound to this table */
	int ifindex;
};
/* pernet id for looking up the per-netns VRF state; fused keywords
 * ("staticunsignedint") restored so the declaration compiles.
 */
static unsigned int vrf_net_id;
/* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules;
/* called with rtnl lock held */ staticint
vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack)
{ struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me;
u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res;
/* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible).
*/
new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM;
me = vrf_map_lookup_elem(vmap, table_id); if (!me) {
me = new_me;
vrf_map_add_elem(vmap, me); goto link_vrf;
}
/* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table.
*/
free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */
NL_SET_ERR_MSG(extack, "Table is used by another VRF");
res = -EBUSY; goto unlock;
}
link_vrf:
users = ++me->users; if (users == 2)
++vmap->shared_tables;
list_add(&vrf->me_list, &me->vrf_list);
res = 0;
unlock:
vrf_map_unlock(vmap);
/* clean-up, if needed */ if (free_new_me)
vrf_map_elem_free(new_me);
return res;
}
/* called with rtnl lock held */ staticvoid vrf_map_unregister_dev(struct net_device *dev)
{ struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev);
u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users;
vrf_map_lock(vmap);
me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock;
/* no one will refer to this element anymore */
vrf_map_elem_free(me);
}
unlock:
vrf_map_unlock(vmap);
}
/* return the vrf device index associated with the table_id; only valid
 * in strict mode, where a table maps to exactly one VRF.
 * Returns -EPERM when strict mode is off, -ENODEV when no VRF is bound
 * to the table.
 */
static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
{
	struct vrf_map *vmap = netns_vrf_map(net);
	struct vrf_map_elem *entry;
	int idx = -EPERM;

	vrf_map_lock(vmap);

	/* without strict mode the table<->vrf mapping is not unique */
	if (!vmap->strict_mode)
		goto unlock;

	entry = vrf_map_lookup_elem(vmap, table_id);
	if (!entry) {
		idx = -ENODEV;
		goto unlock;
	}

	idx = vrf_map_elem_get_vrf_ifindex(entry);

unlock:
	vrf_map_unlock(vmap);

	return idx;
}
/* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue.
*/ staticbool qdisc_tx_is_default(conststruct net_device *dev)
{ struct netdev_queue *txq;
if (dev->num_tx_queues > 1) returnfalse;
txq = netdev_get_tx_queue(dev, 0);
return qdisc_txq_has_no_queue(txq);
}
/* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling.
*/ staticint vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst)
{ unsignedint len = skb->len;
skb_orphan(skb);
skb_dst_set(skb, dst);
/* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing
*/
skb->pkt_type = PACKET_LOOPBACK;
skb->protocol = eth_type_trans(skb, dev);
if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
dev_dstats_rx_add(dev, len); else
dev_dstats_rx_dropped(dev);
/* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path.
*/ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst);
skb_dst_set(skb, dst);
/* strip the ethernet header added for pass through VRF device */
__skb_pull(skb, skb_network_offset(skb));
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret)))
dev->stats.tx_errors++; else
ret = NET_XMIT_SUCCESS;
/* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ staticint vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{ int err;
rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err;
skb_dst_drop(skb);
/* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path.
*/ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst);
skb_dst_set(skb, &rt->dst);
/* strip the ethernet header added for pass through VRF device */
__skb_pull(skb, skb_network_offset(skb));
if (!ip4h->saddr) {
ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
RT_SCOPE_LINK);
}
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret)))
vrf_dev->stats.tx_errors++; else
ret = NET_XMIT_SUCCESS;
/* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device.
*/ staticstruct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb)
{ struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6;
/* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown
*/ if (rt6) {
dst = &rt6->dst;
netdev_ref_replace(dst->dev, net->loopback_dev,
&dst->dev_tracker, GFP_KERNEL);
dst->dev = net->loopback_dev;
dst_release(dst);
}
}
staticint vrf_rt6_create(struct net_device *dev)
{ int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM;
/* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0;
vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out;
/* create a dst for routing packets out a VRF device */
rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out;
/* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len); if (!skb) {
dev->stats.tx_errors++; return -ENOMEM;
}
}
rcu_read_lock();
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret;
sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */
ret = neigh_output(neigh, skb, is_v6gw);
rcu_read_unlock(); return ret;
}
/* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device.
*/ staticstruct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb)
{ struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth;
/* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown
*/ if (rth) {
dst = &rth->dst;
netdev_ref_replace(dst->dev, net->loopback_dev,
&dst->dev_tracker, GFP_KERNEL);
dst->dev = net->loopback_dev;
dst_release(dst);
}
}
/* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf.
*/ if (port_dev == dev_net(dev)->loopback_dev) {
NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP;
}
port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err;
/* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header.
*/
err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS;
__skb_push(skb, ETH_HLEN);
eth = (struct ethhdr *)skb->data;
/* we set the ethernet destination and the source addresses to the * address of the VRF device.
*/
ether_addr_copy(eth->h_dest, vrf_dev->dev_addr);
ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
eth->h_proto = htons(proto);
/* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally.
*/
skb->protocol = eth->h_proto;
skb->pkt_type = PACKET_HOST;
skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
skb_pull_inline(skb, ETH_HLEN);
return 0;
}
/* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately.
*/ staticint vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev,
u16 proto, struct net_device *orig_dev)
{ if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0;
#if IS_ENABLED(CONFIG_IPV6)
/* neighbor handling is done with actual device; do not want
 * to flip skb->dev for those ndisc packets. This really fails
 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
 * a start.
 *
 * Returns true when @skb is an ICMPv6 Neighbor Discovery message.
 */
static bool ipv6_ndisc_frame(const struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	bool rc = false;

	/* BUGFIX: the icmph locals and the NEXTHDR_ICMP guard were lost in the
	 * original chunk, leaving icmph undeclared and a stray brace. Only a
	 * directly-following ICMPv6 header is parsed (no ext-header walk).
	 */
	if (iph->nexthdr == NEXTHDR_ICMP) {
		const struct icmp6hdr *icmph;
		struct icmp6hdr _icmph;

		icmph = skb_header_pointer(skb, sizeof(*iph),
					   sizeof(_icmph), &_icmph);
		if (!icmph)
			goto out;

		switch (icmph->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			rc = true;
			break;
		}
	}

out:
	return rc;
}
staticstruct rt6_info *vrf_ip6_route_lookup(struct net *net, conststruct net_device *dev, struct flowi6 *fl6, int ifindex, conststruct sk_buff *skb, int flags)
{ struct net_vrf *vrf = netdev_priv(dev);
/* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex.
*/ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out;
/* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb
*/ if (skb->pkt_type == PACKET_LOOPBACK) {
skb->pkt_type = PACKET_HOST; goto out;
}
/* called with rcu lock held */ staticstruct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb,
u16 proto)
{ switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb);
}
return skb;
}
#if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function.
*/ staticstruct dst_entry *vrf_link_scope_lookup(conststruct net_device *dev, struct flowi6 *fl6)
{ struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt;
/* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense
*/ if (fl6->flowi6_oif == dev->ifindex) {
dst = &net->ipv6.ip6_null_entry->dst; return dst;
}
if (!ipv6_addr_any(&fl6->saddr))
flags |= RT6_LOOKUP_F_HAS_SADDR;
/* default to no qdisc; user can add if desired */
dev->priv_flags |= IFF_NO_QUEUE;
dev->priv_flags |= IFF_NO_RX_HANDLER;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
/* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking.
*/
dev->min_mtu = IPV6_MIN_MTU;
dev->max_mtu = IP6_MAX_MTU;
dev->mtu = dev->max_mtu;
/* mapping between table_id and vrf; * note: such binding could not be done in the dev init function * because dev->ifindex id is not available yet.
*/
vrf->ifindex = dev->ifindex;
/* Toggle strict mode (1:1 vrf<->table association) on the map.
 * Enabling is refused with -EBUSY while any table is shared by more
 * than one VRF (shared_tables != 0); disabling always succeeds.
 * Returns 0 on success or when the mode is already as requested.
 */
static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
{
	bool *cur_mode;
	int res = 0;

	vrf_map_lock(vmap);

	cur_mode = &vmap->strict_mode;
	if (*cur_mode == new_mode)
		goto unlock;

	if (*cur_mode) {
		/* disable strict mode */
		*cur_mode = false;
	} else {
		if (vmap->shared_tables) {
			/* we cannot allow strict_mode because there are some
			 * vrfs that share one or more tables.
			 */
			res = -EBUSY;
			goto unlock;
		}

		/* no tables are shared among vrfs, so we can go back
		 * to 1:1 association between a vrf with its table.
		 */
		*cur_mode = true;
	}

	/* BUGFIX: the chunk was truncated; without the label below the lock
	 * was never released and the function never returned.
	 */
unlock:
	vrf_map_unlock(vmap);

	return res;
}
/* NOTE(review): the German text below is website-disclaimer boilerplate that
 * was accidentally appended to this source file during extraction and is not
 * part of the driver. English translation, preserved for reference:
 *
 * "The information on this web page has been compiled carefully and to the
 * best of our knowledge. However, neither completeness, nor correctness,
 * nor quality of the provided information is guaranteed.
 * Remark: the colored syntax highlighting and the measurement are still
 * experimental."
 */