// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
/* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { /* idev stays alive because we hold rcu_read_lock(). */
skb = skb_expand_head(skb, hh_len); if (!skb) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); return -ENOMEM;
}
}
/* Do not check for IFF_ALLMULTI; multicast routing is not supported in any case.
*/ if (newskb)
NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
if (IS_ERR_OR_NULL(neigh)) { if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL;
}
}
sock_confirm_neigh(skb, neigh);
ret = neigh_output(neigh, skb, false); return ret;
}
staticint
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, struct sk_buff *skb, unsignedint mtu)
{ struct sk_buff *segs, *nskb;
netdev_features_t features; int ret = 0;
/* Please see corresponding comment in ip_finish_output_gso * describing the cases where GSO segment length exceeds the * egress MTU.
*/
features = netif_skb_features(skb);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb); return -ENOMEM;
}
consume_skb(skb);
skb_list_walk_safe(segs, segs, nskb) { int err;
skb_mark_not_on_list(segs); /* Last GSO segment can be smaller than gso_size (and MTU). * Adding a fragment header would produce an "atomic fragment", * which is considered harmful (RFC-8021). Avoid that.
*/
err = segs->len > mtu ?
ip6_fragment(net, sk, segs, ip6_finish_output2) :
ip6_finish_output2(net, sk, segs); if (err && ret == 0)
ret = err;
}
mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
/* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing
*/
skb = l3mdev_ip6_out((struct sock *)sk, skb); if (unlikely(!skb)) {
ret = 0; goto unlock;
}
/* hooks should never assume socket lock is held. * we promote our socket to non const
*/
ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
net, (struct sock *)sk, skb, NULL, dev,
dst_output); goto unlock;
}
ret = -EMSGSIZE;
skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const
*/
ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
switch (icmp6->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: /* For reaction involving unicast neighbor discovery * message destined to the proxied address, pass it to * input function.
*/ return 1; default: break;
}
}
/* * The proxying router can't forward traffic sent to a link-local * address, so signal the sender and discard the packet. This * behavior is clarified by the MIPv6 specification.
*/ if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
dst_link_failure(skb); return -1;
}
/* * We DO NOT make any processing on * RA packets, pushing them to user level AS IS * without ane WARRANTY that application will be able * to interpret them. The reason is that we * cannot make anything clever here. * * We are not end-node, so that if packet contains * AH/ESP, we cannot make anything. * Defragmentation also would be mistake, RA packets * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK
*/ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { if (ip6_call_ra_chain(skb, ntohs(opt->ra))) return 0;
}
/* * check and decrement ttl
*/ if (hdr->hop_limit <= 1) {
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
/* XXX: idev->cnf.proxy_ndp? */ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit * here by 1, as we do at the end of the * function too. * * But that would be incorrect, as proxying is * not forwarding. The ip6_input function * will handle this packet locally, and it * depends on the hop limit being unchanged. * * One example is the NDP hop limit, that * always has to stay 255, but other would be * similar checks around RA packets, where the * user can even change the desired limit.
*/ return ip6_input(skb);
} elseif (proxied < 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop;
}
}
if (!xfrm6_route_forward(skb)) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
SKB_DR_SET(reason, XFRM_POLICY); goto drop;
}
dst = skb_dst(skb);
dev = dst_dev(dst); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec.
*/ if (IP6CB(skb)->iif == dev->ifindex &&
opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct inet_peer *peer; struct rt6_info *rt;
/* * incoming and outgoing devices are the same * send a redirect.
*/
/* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect)
*/ if (inet_peer_xrlim_allow(peer, 1*HZ))
ndisc_send_redirect(skb, target);
rcu_read_unlock();
} else { int addrtype = ipv6_addr_type(&hdr->saddr);
/* This check is security critical. */ if (addrtype == IPV6_ADDR_ANY ||
addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) goto error; if (addrtype & IPV6_ADDR_LINKLOCAL) {
icmpv6_send(skb, ICMPV6_DEST_UNREACH,
ICMPV6_NOT_NEIGHBOUR, 0); goto error;
}
}
len = state->left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu)
len = state->mtu; /* IF: we are not sending up to and including the packet end
then align the next start on an eight byte boundary */ if (len < state->left)
len &= ~7;
/* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket.
*/ if (unlikely(!skb->ignore_df && skb->len > mtu)) goto fail_toobig;
if (IP6CB(skb)->frag_max_size) { if (IP6CB(skb)->frag_max_size > mtu) goto fail_toobig;
/* don't send fragments larger than what we received */
mtu = IP6CB(skb)->frag_max_size; if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
}
if (np) {
u32 frag_size = READ_ONCE(np->frag_size);
if (frag_size && frag_size < mtu)
mtu = frag_size;
} if (mtu < hlen + sizeof(struct frag_hdr) + 8) goto fail_toobig;
mtu -= hlen + sizeof(struct frag_hdr);
if (dst->ops->family != AF_INET6) {
dst_release(dst); return NULL;
}
rt = dst_rt6_info(dst); /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, * and MSG_DONTROUTE --ANK (980726) * * 1. ip6_rt_check(): If route was host route, * check that cached destination is current. * If it is network route, we still may * check its validity using saved pointer * to the last used address: daddr_cache. * We do not want to save whole address now, * (because main consumer of this service * is tcp, which has not this problem), * so that the last trick works only on connected * sockets. * 2. oif also should be the same.
*/ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES
ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif
(fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
dst_release(dst);
dst = NULL;
}
out: return dst;
}
staticint ip6_dst_lookup_tail(struct net *net, conststruct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{ #ifdef CONFIG_IPV6_OPTIMISTIC_DAD struct neighbour *n; struct rt6_info *rt; #endif int err; int flags = 0;
/* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the * ip6_route_output call _before_ ip6_route_get_saddr. * * In source specific routing (no src=any default route), * ip6_route_output will fail given src=any saddr, though, so * that's why we try it again later.
*/ if (ipv6_addr_any(&fl6->saddr)) { struct fib6_info *from; struct rt6_info *rt;
/* If we had an erroneous initial result, pretend it * never existed and let the SA-enabled version take * over.
*/ if ((*dst)->error) {
dst_release(*dst);
*dst = NULL;
}
if (fl6->flowi6_oif)
flags |= RT6_LOOKUP_F_IFACE;
}
if (!*dst)
*dst = ip6_route_output_flags(net, sk, fl6, flags);
err = (*dst)->error; if (err) goto out_err_release;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * Here if the dst entry we've looked up * has a neighbour entry that is in the INCOMPLETE * state and the src address from the flow is * marked as OPTIMISTIC, we release the found * dst entry and replace it instead with the * dst entry of the nexthop router
*/
rt = dst_rt6_info(*dst);
rcu_read_lock();
n = __ipv6_neigh_lookup_noref(rt->dst.dev,
rt6_nexthop(rt, &fl6->daddr));
err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
rcu_read_unlock();
if (err) { struct inet6_ifaddr *ifp; struct flowi6 fl_gw6; int redirect;
redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); if (ifp)
in6_ifa_put(ifp);
if (redirect) { /* * We need to get the dst entry for the * default router instead
*/
dst_release(*dst);
memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
*dst = ip6_route_output(net, sk, &fl_gw6);
err = (*dst)->error; if (err) goto out_err_release;
}
} #endif if (ipv6_addr_v4mapped(&fl6->saddr) &&
!(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
err = -EAFNOSUPPORT; goto out_err_release;
}
return 0;
out_err_release:
dst_release(*dst);
*dst = NULL;
if (err == -ENETUNREACH)
IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err;
}
/** * ip6_dst_lookup - perform route lookup on flow * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @dst: pointer to dst_entry * for result * @fl6: flow to lookup * * This function performs a route lookup on the given flow. * * It returns zero on success, or a standard errno code on error.
*/ int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
*dst = NULL; return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/** * ip6_dst_lookup_flow - perform route lookup on flow with ipsec * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * * This function performs a route lookup on the given flow. * * It returns a valid dst pointer on success, or a pointer encoded * error code.
*/ struct dst_entry *ip6_dst_lookup_flow(struct net *net, conststruct sock *sk, struct flowi6 *fl6, conststruct in6_addr *final_dst)
{ struct dst_entry *dst = NULL; int err;
err = ip6_dst_lookup_tail(net, sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst)
fl6->daddr = *final_dst;
/** * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow * @sk: socket which provides the dst cache and route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * @connected: whether @sk is connected or not * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. * It will take the socket dst lock when operating on the dst cache. * As a result, this function can only be used in process context. * * In addition, for a connected socket, cache the dst in the socket * if the current cache is not valid. * * It returns a valid dst pointer on success, or a pointer encoded * error code.
*/ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, conststruct in6_addr *final_dst, bool connected)
{ struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
dst = ip6_sk_dst_check(sk, dst, fl6); if (dst) return dst;
staticvoid ip6_append_data_mtu(unsignedint *mtu, int *maxfraglen, unsignedint fragheaderlen, struct sk_buff *skb, struct rt6_info *rt, unsignedint orig_mtu)
{ if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { if (!skb) { /* first fragment, reserve header_len */
*mtu = orig_mtu - rt->dst.header_len;
} else { /* * this fragment is not first, the headers * space is regarded as data space.
*/
*mtu = orig_mtu;
}
*maxfraglen = ((*mtu - fragheaderlen) & ~7)
+ fragheaderlen - sizeof(struct frag_hdr);
}
}
/* callers pass dst together with a reference, set it first so * ip6_cork_release() can put it down even in case of an error.
*/
cork->base.dst = &rt->dst;
/* * setup for corking
*/ if (opt) { if (WARN_ON(v6_cork->opt)) return -EINVAL;
nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); if (unlikely(!nopt)) return -ENOBUFS;
/* CHECKSUM_PARTIAL only with no extension headers and when * we are not going to fragment
*/ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
length <= mtu - headersize &&
(!(flags & MSG_MORE) || cork->gso_size) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
/* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. * Otherwise, we need to reserve fragment header and * fragment alignment (= 8-15 octects, in total). * * Note that we may need to "move" the data from the tail * of the buffer to the new fragment when we split * the message. * * FIXME: It may be fragmented into multiple chunks * at once if non-fragmentable extension headers * are too large. * --yoshfuji
*/
cork->length += length; if (!skb) goto alloc_new_skb;
while (length > 0) { /* Check if the remaining data fits into current packet. */
copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; if (copy < length)
copy = maxfraglen - skb->len;
if (copy <= 0) { char *data; unsignedint datalen; unsignedint fraglen; unsignedint fraggap; unsignedint alloclen, alloc_extra; unsignedint pagedlen;
alloc_new_skb: /* There's no room in the current skb */ if (skb)
fraggap = skb->len - maxfraglen; else
fraggap = 0; /* update mtu and maxfraglen if necessary */ if (!skb || !skb_prev)
ip6_append_data_mtu(&mtu, &maxfraglen,
fragheaderlen, skb, rt,
orig_mtu);
skb_prev = skb;
/* * If remaining data exceeds the mtu, * we know we need more fragment(s).
*/
datalen = length + fraggap;
if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
pagedlen = 0;
/* We just reserve space for fragment header. * Note: this may be overallocation if the message * (without MSG_MORE) fits into the MTU.
*/
alloc_extra += sizeof(struct frag_hdr);
if (datalen != length + fraggap) { /* * this is not the last fragment, the trailer * space is regarded as data space.
*/
datalen += rt->dst.trailer_len;
}
fraglen = datalen + fragheaderlen;
copy = datalen - transhdrlen - fraggap - pagedlen; /* [!] NOTE: copy may be negative if pagedlen>0 * because then the equation may reduces to -fraggap.
*/ if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
err = -EINVAL; goto error;
} if (transhdrlen) {
skb = sock_alloc_send_skb(sk, alloclen,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
2 * sk->sk_sndbuf)
skb = alloc_skb(alloclen,
sk->sk_allocation); if (unlikely(!skb))
err = -ENOBUFS;
} if (!skb) goto error; /* * Fill in the control structures
*/
skb->protocol = htons(ETH_P_IPV6);
skb->ip_summed = csummode;
skb->csum = 0; /* reserve for fragmentation and ipsec header */
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
dst_exthdrlen);
/* * Find where to start putting bytes
*/
data = skb_put(skb, fraglen - pagedlen);
skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
skb->transport_header = (skb->network_header +
fragheaderlen); if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
} if (copy > 0 &&
INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
from, data + transhdrlen, offset,
copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb); goto error;
} elseif (flags & MSG_SPLICE_PAGES) {
copy = 0;
}
/* Only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
/* * Put the packet on the pending queue
*/ if (!skb->destructor) {
skb->destructor = sock_wfree;
skb->sk = sk;
wmem_alloc_delta += skb->truesize;
}
__skb_queue_tail(queue, skb); continue;
}
if (copy > length)
copy = length;
if (!(rt->dst.dev->features&NETIF_F_SG) &&
skb_tailroom(skb) >= copy) { unsignedint off;
/*
 * NOTE(review): the following text is web-scrape residue (a German
 * website disclaimer), not part of the kernel source.  Translated and
 * commented out so it no longer sits as bare prose in a C file:
 *
 * "The information on this website has been compiled carefully and to
 *  the best of our knowledge.  However, neither completeness, nor
 *  correctness, nor quality of the information provided is guaranteed.
 *  Note: the colour syntax highlighting and the measurement are still
 *  experimental."
 */