// SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) output module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <Alan.Cox@linux.org> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Hirokazu Takahashi, <taka@valinux.co.jp> * * See ip_input.c for original log * * Fixes: * Alan Cox : Missing nonblock feature in ip_build_xmit. * Mike Kilburn : htons() missing in ip_build_xmit. * Bradford Johnson: Fix faulty handling of some frames when * no route is found. * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit * (in case if packet not accepted by * output firewall rules) * Mike McLagan : Routing by source * Alexey Kuznetsov: use new route cache * Andi Kleen: Fix broken PMTU recovery and remove * some redundant tests. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Andi Kleen : Replace ip_reply with ip_send_reply. * Andi Kleen : Split fast and slow ip_build_xmit path * for decreased register pressure on x86 * and more readability. * Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. * Hirokazu Takahashi: HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi: sendfile() on UDP works now.
*/
/* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing
*/
skb = l3mdev_ip_out(sk, skb); if (unlikely(!skb)) return 0;
/* OUTOCTETS should be counted after fragment */
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM;
}
if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb);
if (res != LWTUNNEL_XMIT_CONTINUE) return res;
}
rcu_read_lock();
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res;
sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */
res = neigh_output(neigh, skb, is_v6gw);
rcu_read_unlock(); return res;
}
rcu_read_unlock();
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return PTR_ERR(neigh);
}
staticint ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsignedint mtu)
{ struct sk_buff *segs, *nskb;
netdev_features_t features; int ret = 0;
/* common case: seglen is <= mtu
*/ if (skb_gso_validate_network_len(skb, mtu)) return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length exceeds the egress MTU. * * This can happen in several cases: * - Forwarding of a TCP GRO skb, when DF flag is not set. * - Forwarding of an skb that arrived on a virtualization interface * (virtio-net/vhost/tap) with TSO/GSO size set by other network * stack. * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an * interface with a smaller MTU. * - Arriving GRO skb (or GSO skb in a virtualized environment) that is * bridged to a NETIF_F_TSO tunnel stacked over an interface with an * insufficient MTU.
*/
features = netif_skb_features(skb);
BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb); return -ENOMEM;
}
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_CN:
do_cn = true;
fallthrough; case NET_XMIT_SUCCESS: break; default:
kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret;
}
/* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten, * see ipv4_pktinfo_prepare().
*/
new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb)); if (new_rt) {
new_rt->rt_iif = 0;
skb_dst_drop(skb);
skb_dst_set(skb, &new_rt->dst);
}
/* * If the indicated interface is up and running, send the packet.
*/
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
/* * Multicasts are looped back for other local users
*/
if (rt->rt_flags&RTCF_MULTICAST) { if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped by ip_mr_input in any case. Note, that local frames are looped back to be delivered to local recipients.
This check is duplicated in ip_mr_input at the moment.
*/
&&
((rt->rt_flags & RTCF_LOCAL) ||
!(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif
) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
if (ip_hdr(skb)->ttl == 0) {
kfree_skb(skb); return 0;
}
}
if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
ip_mc_finish_output);
}
/* Note: skb->sk can be different from sk, in case of tunnels */ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
__u8 tos)
{ struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res;
/* Skip all of this if the packet is already routed, * f.e. by something like SCTP.
*/
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
fl4 = &fl->u.ip4;
rt = skb_rtable(skb); if (rt) goto packet_routed;
/* Make sure we can route this packet. */
rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) {
inet_sk_init_flowi4(inet, fl4);
/* sctp_v4_xmit() uses its own DSCP value */
fl4->flowi4_tos = tos & INET_DSCP_MASK;
/* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out.
*/
rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route;
sk_setup_caps(sk, &rt->dst);
}
skb_dst_set_noref(skb, &rt->dst);
packet_routed: if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route;
/*
 * ip_frag_ipcb - propagate per-packet control-block state to a new fragment.
 *
 * @from:       the original (parent) skb being fragmented
 * @to:         the freshly built fragment skb
 * @first_frag: true when @to is the first fragment produced from @from
 *
 * Fix vs. original: the storage-class and return type were fused into the
 * single token "staticvoid", which does not compile; restored to
 * "static void".
 */
static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
			 bool first_frag)
{
	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

	/* ANK: dirty, but effective trick. Upgrade options only if
	 * the segment to be fragmented was THE FIRST (otherwise,
	 * options are already fixed) and make it ONCE
	 * on the initial skb, so that all the following fragments
	 * will inherit fixed options.
	 */
	if (first_frag)
		ip_options_fragment(from);
}
/* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu)
len = state->mtu; /* IF: we are not sending up to and including the packet end
then align the next start on an eight byte boundary */ if (len < state->left) {
len &= ~7;
}
/* * Copy a block of the IP datagram.
*/ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
BUG();
state->left -= len;
/* * Fill in the new header fields.
*/
iph = ip_hdr(skb2);
iph->frag_off = htons((state->offset >> 3)); if (state->DF)
iph->frag_off |= htons(IP_DF);
/* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit
*/ if (state->left > 0 || state->not_last_frag)
iph->frag_off |= htons(IP_MF);
state->ptr += len;
state->offset += len;
iph->tot_len = htons(len + state->hlen);
ip_send_check(iph);
return skb2;
}
EXPORT_SYMBOL(ip_frag_next);
/* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending.
*/
/* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL &&
(err = skb_checksum_help(skb))) goto fail;
/* * Point into the IP datagram header.
*/
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(sk, skb); if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
mtu = IPCB(skb)->frag_max_size;
/* * Setup starting values.
*/
hlen = iph->ihl * 4;
mtu = mtu - hlen; /* Size of data space */
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
/* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing * one, it is not prohibited. In this case fall back to copying. * * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment.
*/ if (skb_has_frag_list(skb)) { struct sk_buff *frag, *frag2; unsignedint first_len = skb_pagelen(skb);
/* * transhdrlen > 0 means that this is the first fragment and we wish * it won't be fragmented in the future.
*/ if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
(!(flags & MSG_MORE) || cork->gso_size) &&
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
/* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, * each of segments is IP fragment ready for sending to network after * adding appropriate IP header.
*/
if (!skb) goto alloc_new_skb;
while (length > 0) { /* Check if the remaining data fits into current packet. */
copy = mtu - skb->len; if (copy < length)
copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsignedint datalen; unsignedint fraglen; unsignedint fraggap; unsignedint alloclen, alloc_extra; unsignedint pagedlen; struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb; if (skb_prev)
fraggap = skb_prev->len - maxfraglen; else
fraggap = 0;
/* * If remaining data exceeds the mtu, * we know we need more fragment(s).
*/
datalen = length + fraggap; if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
fraglen = datalen + fragheaderlen;
pagedlen = 0;
/* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last.
*/ if (datalen == length + fraggap)
alloc_extra += rt->dst.trailer_len;
/* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
/* * Put the packet on the pending queue.
*/ if (!skb->destructor) {
skb->destructor = sock_wfree;
skb->sk = sk;
wmem_alloc_delta += skb->truesize;
}
__skb_queue_tail(queue, skb); continue;
}
if (copy > length)
copy = length;
if (!(rt->dst.dev->features&NETIF_F_SG) &&
skb_tailroom(skb) >= copy) { unsignedint off;
/* * ip_append_data() can make one large IP datagram from many pieces of * data. Each piece will be held on the socket until * ip_push_pending_frames() is called. Each piece can be a page or * non-page data. * * Not only UDP, other transport protocols - e.g. raw sockets - can use * this interface potentially. * * LATER: length must be adjusted by pad at tail, when it is required.
*/ int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsignedint flags)
{ struct inet_sock *inet = inet_sk(sk); int err;
if (flags&MSG_PROBE) return 0;
if (skb_queue_empty(&sk->sk_write_queue)) {
err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); if (err) return err;
} else {
transhdrlen = 0;
}
/* * Combined all pending IP fragments on the socket as one IP datagram * and push them out.
 */ struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork)
/* Coalesces every skb queued on @queue into one datagram: the first skb
 * becomes the head and all following skbs are chained onto its frag_list.
 * Returns the assembled skb, or NULL if the queue was empty.
 *
 * NOTE(review): compared to what this function's locals imply, the section
 * that actually builds the IPv4 header appears to be missing from this copy:
 * 'iph' is declared but never assigned before it is dereferenced below,
 * 'opt', 'ttl' are never used, and 'df' is computed but never stored into a
 * header.  Confirm against the upstream version of this function.
 */
{ struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph;
u8 pmtudisc, ttl;
__be16 df = 0;
/* Nothing pending: hand back NULL via the common exit path. */
skb = __skb_dequeue(queue); if (!skb) goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
/* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
/* Strip each follower's network header and append it to the head
 * skb's frag_list, folding its byte/true-size accounting into the
 * head and detaching it from the socket (destructor/sk cleared) so
 * only the head owns the socket write-memory charge.
 */
__skb_pull(tmp_skb, skb_network_header_len(skb));
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
skb->truesize += tmp_skb->truesize;
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
}
/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out.
 */
skb->ignore_df = ip_sk_ignore_df(sk);
/* DF bit is set when we want to see DF on outgoing frames. * If ignore_df is set too, we still allow to fragment this frame
* locally. */
pmtudisc = READ_ONCE(inet->pmtudisc); if (pmtudisc == IP_PMTUDISC_DO ||
pmtudisc == IP_PMTUDISC_PROBE ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
/* Propagate the cork's QoS/mark settings and stamp the delivery time
 * with the clock appropriate to the socket type.
 */
skb->priority = cork->priority;
skb->mark = cork->mark; if (sk_is_tcp(sk))
skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); else
skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount
 */
cork->dst = NULL;
skb_dst_set(skb, &rt->dst);
/* NOTE(review): 'iph' is uninitialized here in this copy of the
 * function (no 'iph = ip_hdr(skb)' in the visible body) — this
 * dereference is a use of an uninitialized pointer as written.
 */
if (iph->protocol == IPPROTO_ICMP) {
u8 icmp_type;
/* For such sockets, transhdrlen is zero when do ip_append_data(), * so icmphdr does not in skb linear region and can not get icmp_type * by icmp_hdr(skb)->type.
 */ if (sk->sk_type == SOCK_RAW &&
!(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH))
icmp_type = fl4->fl4_icmp_type; else
icmp_type = icmp_hdr(skb)->type;
/* Bump the per-netns ICMP output counter for this message type. */
icmp_out_count(net, icmp_type);
}
ip_cork_release(cork);
out: return skb;
}
/*
 * ip_send_skb - hand a fully built IP datagram to the local output path.
 *
 * @net: network namespace to account statistics against
 * @skb: datagram to transmit (consumed by this call)
 *
 * Returns 0 on success or a negative errno.  Positive NET_XMIT_* codes
 * from ip_local_out() are remapped to errno values, and any remaining
 * error bumps the OUTDISCARDS MIB counter.
 *
 * Fix vs. original: this copy was truncated — the function fell off the
 * end without a closing brace or return statement; restored "return err;"
 * so a non-void function always returns a value.
 */
int ip_send_skb(struct net *net, struct sk_buff *skb)
{
	int err;

	err = ip_local_out(net, skb->sk, skb);
	if (err) {
		/* ip_local_out() may report congestion via positive
		 * NET_XMIT_* codes; translate before counting a discard.
		 */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
/* * Fetch data from kernel space and fill in checksum if needed.
*/ staticint ip_reply_glue_bits(void *dptr, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
__wsum csum;
/* NOTE(review): the following text is extraction residue from a web page
 * and is not part of this source file; it is wrapped in a comment so it
 * cannot be mistaken for code.  English translation of the German:
 * "The information on this website was carefully compiled to the best of
 * our knowledge.  However, neither completeness, nor correctness, nor
 * quality of the information provided is guaranteed.
 * Note: the syntax colouring and the measurement are still experimental."
 */