/* Source: tcp_output.c (language: C) */

// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system.  INET is implemented using the  BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*/

/*
* Changes: Pedro Roque : Retransmit queue handled by TCP.
* : Fragmentation on mtu decrease
* : Segment collapse on retransmit
* : AF independence
*
* Linus Torvalds : send_delayed_ack
* David S. Miller : Charge memory using the right skb
* during syn/ack processing.
* David S. Miller : Output engine completely rewritten.
* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
* Cacophonix Gaul : draft-minshall-nagle-01
* J Hadi Salim : ECN support
*
*/

#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/skbuff_ref.h>

#include <trace/events/tcp.h>

/* Refresh clocks of a TCP socket,
* ensuring monotonically increasing values.
*/
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
	/* Sample the clock once so both cached values describe the same
	 * instant: the raw nanosecond reading and its microsecond quotient.
	 */
	const u64 now_ns = tcp_clock_ns();

	tp->tcp_clock_cache = now_ns;
	tp->tcp_mstamp = div_u64(now_ns, NSEC_PER_USEC);
}

/* Forward declaration: tcp_write_xmit() is called (e.g. from
 * tcp_tsq_write()) before its definition has been seen in this file.
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
      int push_one, gfp_t gfp);

/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;

WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);

__skb_unlink(skb, &sk->sk_write_queue);
tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);

if (tp->highest_sack == NULL)
  tp->highest_sack = skb;

tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
  tcp_rearm_rto(sk);

NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
        tcp_skb_pcount(skb));
tcp_check_space(sk);
}

/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
* window scaling factor due to loss of precision.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
* invalid. OK, let's make this for now:
*/
static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);

if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
     (tp->rx_opt.wscale_ok &&
      ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
  return tp->snd_nxt;
else
  return tcp_wnd_end(tp);
}

/* Calculate mss to advertise in SYN segment.
* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
*
* 1. It is independent of path mtu.
* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
*    attached devices, because some buggy hosts are confused by
*    large MSS.
* 4. We do not make 3, we advertise MSS, calculated from first
*    hop device mtu, but allow to raise it to ip_rt_min_advmss.
*    This may be overridden via information stored in routing table.
* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
*    probably even Jumbo".
*/
static __u16 tcp_advertise_mss(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int mss = tp->advmss;
	unsigned int metric;

	/* Without a cached route there is nothing to clamp against. */
	if (!dst)
		return (__u16)mss;

	/* Lower the advertised MSS (and remember it in tp->advmss) when the
	 * route's advmss metric is smaller than the current value.
	 */
	metric = dst_metric_advmss(dst);
	if (metric < mss) {
		mss = metric;
		tp->advmss = mss;
	}

	return (__u16)mss;
}

/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window".
* This is the first part of cwnd validation mechanism.
*/
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
	u32 cwnd = tcp_snd_cwnd(tp);

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);

	/* The restart window never exceeds the current cwnd. */
	restart_cwnd = min(restart_cwnd, cwnd);

	/* Halve cwnd once for every icsk_rto that fits into delta, stopping
	 * as soon as cwnd is no longer above the restart window.
	 */
	for (;;) {
		delta -= inet_csk(sk)->icsk_rto;
		if (delta <= 0 || cwnd <= restart_cwnd)
			break;
		cwnd >>= 1;
	}

	tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->snd_cwnd_used = 0;
}

/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_jiffies32;
	u32 since_last_rcv;

	/* Nothing in flight: tell congestion control that a transmission
	 * is starting.
	 */
	if (!tcp_packets_in_flight(tp))
		tcp_ca_event(sk, CA_EVENT_TX_START);

	tp->lsndtime = now;

	/* Sending within 'ato' of the last received packet counts as a
	 * reply: bump the pingpong counter.
	 */
	since_last_rcv = now - icsk->icsk_ack.lrcvtime;
	if (since_last_rcv < icsk->icsk_ack.ato)
		inet_csk_inc_pingpong_cnt(sk);
}

/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Flush the compressed-ACK count into the MIB counter and stop the
	 * compression timer; a successfully cancelled timer gives back its
	 * socket reference.
	 */
	if (unlikely(tp->compressed_ack)) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
			      tp->compressed_ack);
		tp->compressed_ack = 0;
		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
			__sock_put(sk);
	}

	/* An ACK whose rcv_nxt differs from tp->rcv_nxt is the special ACK
	 * sent by DCTCP to reflect ECN; it must not touch the quickack or
	 * delayed-ACK state.
	 */
	if (likely(rcv_nxt == tp->rcv_nxt)) {
		tcp_dec_quickack_mode(sk);
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
	}
}

/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
* NOTE: for smooth operation initial space offering should
* be a multiple of mss if possible. We assume here that mss >= 1.
* This MUST be enforced by all callers.
*/
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *__window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	u32 window_clamp = READ_ONCE(*__window_clamp);
	unsigned int space = 0;

	/* A negative space offer is treated as zero. */
	if (__space > 0)
		space = __space;

	/* No clamp configured: use the largest window that can be scaled. */
	if (!window_clamp)
		window_clamp = (U16_MAX << TCP_MAX_WSCALE);
	space = min(window_clamp, space);

	/* Keep the offer a whole number of segments when it exceeds one. */
	if (space > mss)
		space = rounddown(space, mss);

	/* With the signed-window workaround enabled the initial offer is
	 * capped at MAX_TCP_WINDOW, because some buggy peers misread the
	 * window field as a signed quantity.
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
		*rcv_wnd = min(space, MAX_TCP_WINDOW);
	else
		*rcv_wnd = space;

	/* An explicit initial receive window (in segments) caps it further. */
	if (init_rcv_wnd)
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);

	*rcv_wscale = 0;
	if (wscale_ok) {
		/* Derive the scale from the largest window we could ever
		 * offer: at least tcp_rmem[2] and rmem_max, at most the clamp.
		 */
		space = max_t(u32, space,
			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
		space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
		space = min_t(u32, space, window_clamp);
		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
				      0, TCP_MAX_WSCALE);
	}

	/* The stored clamp must be representable with the chosen scale. */
	WRITE_ONCE(*__window_clamp,
		   min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
EXPORT_IPV6_MOD(tcp_select_initial_window);

/* Choose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied.  The return
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	const u32 prev_wnd = tp->rcv_wnd;
	u32 cur_win, new_win;

	/* We failed to queue data because we are out of memory:
	 * advertise a zero window and disable the fast path.
	 */
	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) {
		tp->pred_flags = 0;
		tp->rcv_wnd = 0;
		tp->rcv_wup = tp->rcv_nxt;
		return 0;
	}

	cur_win = tcp_receive_window(tp);
	new_win = __tcp_select_window(sk);

	/* The freshly computed window is smaller than what the peer may
	 * still use. Unless shrinking is enabled by sysctl and a receive
	 * window scale is in use, never shrink the offered window: keep
	 * advertising the current one, aligned to the scaling granularity.
	 */
	if (new_win < cur_win &&
	    (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) ||
	     !tp->rx_opt.rcv_wscale)) {
		if (!new_win)
			NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}

	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Do not exceed the largest window the header field can carry. */
	if (!tp->rx_opt.rcv_wscale &&
	    READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* Apply RFC1323 scaling to get the on-wire value. */
	new_win >>= tp->rx_opt.rcv_wscale;

	if (new_win) {
		/* Leaving a previously advertised zero window. */
		if (!prev_wnd)
			NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
	} else {
		/* Advertising zero window: disable the fast path. */
		tp->pred_flags = 0;
		if (prev_wnd)
			NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
	}

	return new_win;
}

/* Packet ECN state for a SYN-ACK */
static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* CWR is always cleared in the flags of this skb. */
	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;

	/* ECN disabled on this socket: do not echo ECE either. */
	if (tcp_ecn_disabled(tp)) {
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
		return;
	}

	/* ECN is in use and the congestion control (native or BPF) asks for
	 * ECN: mark the socket for ECT transmission.
	 */
	if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);
}

/* Packet ECN state for a SYN.  */
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Query the BPF side exactly once, before anything else. */
	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
	bool use_ecn;

	/* ECN is requested when the sysctl is 1, or when the congestion
	 * control (native or BPF) needs it ...
	 */
	use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
		  tcp_ca_needs_ecn(sk) || bpf_needs_ecn;

	/* ... or, failing that, when the route carries the ECN feature. */
	if (!use_ecn) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		use_ecn = dst && dst_feature(dst, RTAX_FEATURE_ECN);
	}

	tp->ecn_flags = 0;

	if (!use_ecn)
		return;

	/* Negotiate RFC 3168 ECN: the SYN carries both ECE and CWR. */
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
	tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
	if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
		INET_ECN_xmit(sk);
}

/* Strip the ECN negotiation flags from a SYN when the tcp_ecn_fallback
 * sysctl is enabled.
 */
static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
		return;

	/* Only the flags of this skb are changed here; tp->ecn_flags are
	 * cleared at a later point in time, when the SYN-ACK is ultimately
	 * received.
	 */
	TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
}

/* Set ECE in a SYN-ACK header when the request socket has ecn_ok set. */
static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
	if (!inet_rsk(req)->ecn_ok)
		return;

	th->ece = 1;
}

/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
* be sent.
*/
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
    struct tcphdr *th, int tcp_header_len)
{
struct tcp_sock *tp = tcp_sk(sk);

if (tcp_ecn_mode_rfc3168(tp)) {
  /* Not-retransmitted data segment: set ECT and inject CWR. */
  if (skb->len != tcp_header_len &&
      !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
   INET_ECN_xmit(sk);
   if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
    tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
    th->cwr = 1;
    skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
   }
  } else if (!tcp_ca_needs_ecn(sk)) {
   /* ACK or retransmitted segment: clear ECT|CE */
   INET_ECN_dontxmit(sk);
  }
  if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
   th->ece = 1;
}
}

/* Constructs common control bits of non-data skb. If SYN/FIN is present,
* auto increment end seqno.
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags)
{
	struct tcp_skb_cb *cb = TCP_SKB_CB(skb);

	skb->ip_summed = CHECKSUM_PARTIAL;

	cb->tcp_flags = flags;

	tcp_skb_pcount_set(skb, 1);

	/* SYN and FIN each advance the end sequence number by one;
	 * any other control skb has end_seq == seq.
	 */
	cb->seq = seq;
	cb->end_seq = (flags & (TCPHDR_SYN | TCPHDR_FIN)) ? seq + 1 : seq;
}

/* Report whether SND.UP differs from SND.UNA. */
static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
	const bool urg_pending = tp->snd_up != tp->snd_una;

	return urg_pending;
}

/* Bit flags kept in tcp_out_options.options. Each one records that the
 * corresponding TCP option was selected for the outgoing header. Bits 4-7
 * are not assigned in this list.
 */
#define OPTION_SACK_ADVERTISE BIT(0)
#define OPTION_TS  BIT(1)
#define OPTION_MD5  BIT(2)
#define OPTION_WSCALE  BIT(3)
#define OPTION_FAST_OPEN_COOKIE BIT(8)
#define OPTION_SMC  BIT(9)
#define OPTION_MPTCP  BIT(10)
#define OPTION_AO  BIT(11)

/* Write the SMC experimental option (two 32-bit words) at @ptr when
 * OPTION_SMC is set in *@options. Compiles to an empty function without
 * CONFIG_SMC. @ptr is taken by value, so the caller's pointer is not moved.
 */
static void smc_options_write(__be32 *ptr, u16 *options)
{
#if IS_ENABLED(CONFIG_SMC)
	if (!static_branch_unlikely(&tcp_have_smc))
		return;

	if (likely(!(OPTION_SMC & *options)))
		return;

	/* NOP, NOP, experimental option kind, option length ... */
	ptr[0] = htonl((TCPOPT_NOP  << 24) |
		       (TCPOPT_NOP  << 16) |
		       (TCPOPT_EXP <<  8) |
		       (TCPOLEN_EXP_SMC_BASE));
	/* ... followed by the SMC magic value. */
	ptr[1] = htonl(TCPOPT_SMC_MAGIC);
#endif
}

/* Options chosen for one outgoing segment, in a pre-wire form.
 *
 * Filled in by tcp_syn_options(), tcp_synack_options() and
 * tcp_established_options(); consumed by tcp_options_write(), which emits
 * the actual header bytes.
 */
struct tcp_out_options {
u16 options;  /* bit field of OPTION_* */
u16 mss;  /* 0 to disable */
u8 ws;   /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size;  /* bytes in hash_location */
u8 bpf_opt_len;  /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
struct mptcp_out_options mptcp; /* MPTCP option state, used with OPTION_MPTCP */
};

/* Emit the MPTCP option at @ptr when OPTION_MPTCP was selected in @opts.
 * Compiles to an empty function without CONFIG_MPTCP.
 */
static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
				struct tcp_sock *tp,
				struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
	if (likely(!(OPTION_MPTCP & opts->options)))
		return;

	mptcp_write_options(th, ptr, tp, &opts->mptcp);
#endif
}

#ifdef CONFIG_CGROUP_BPF
/* Compute args[0] for the header-option sock_ops callbacks:
 * BPF_WRITE_HDR_TCP_CURRENT_MSS when there is no skb,
 * BPF_WRITE_HDR_TCP_SYNACK_COOKIE for a cookie SYN-ACK, otherwise 0.
 */
static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
					enum tcp_synack_type synack_type)
{
	int arg0 = 0;

	if (unlikely(!skb))
		arg0 = BPF_WRITE_HDR_TCP_CURRENT_MSS;
	else if (unlikely(synack_type == TCP_SYNACK_COOKIE))
		arg0 = BPF_WRITE_HDR_TCP_SYNACK_COOKIE;

	return arg0;
}

/* req, syn_skb and synack_type are used when writing synack */
/* Ask the sock_ops BPF program how much header option space it wants and
 * reserve it.
 *
 * Returns immediately unless BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG is set on
 * the socket and *remaining is non-zero. Otherwise the program is run with
 * op BPF_SOCK_OPS_HDR_OPT_LEN_CB. If it succeeds and lowered
 * remaining_opt_len, the difference, rounded up to a multiple of 4, is
 * stored in opts->bpf_opt_len and subtracted from *remaining.
 */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
      struct request_sock *req,
      struct sk_buff *syn_skb,
      enum tcp_synack_type synack_type,
      struct tcp_out_options *opts,
      unsigned int *remaining)
{
struct bpf_sock_ops_kern sock_ops;
int err;

if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
        BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
     !*remaining)
  return;

/* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */

/* init sock_ops */
/* Only the members located before 'temp' are zeroed. */
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;

if (req) {
  /* The listen "sk" cannot be passed here because
* it is not locked.  It would not make too much
* sense to do bpf_setsockopt(listen_sk) based
* on individual connection request also.
*
* Thus, "req" is passed here and the cgroup-bpf-progs
* of the listen "sk" will be run.
*
* "req" is also used here for fastopen even the "sk" here is
* a fullsock "child" sk.  It is to keep the behavior
* consistent between fastopen and non-fastopen on
* the bpf programming side.
*/
  sock_ops.sk = (struct sock *)req;
  sock_ops.syn_skb = syn_skb;
} else {
  sock_owned_by_me(sk);

  sock_ops.is_fullsock = 1;
  sock_ops.is_locked_tcp_sock = 1;
  sock_ops.sk = sk;
}

sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
sock_ops.remaining_opt_len = *remaining;
/* tcp_current_mss() does not pass a skb */
if (skb)
  bpf_skops_init_skb(&sock_ops, skb, 0);

err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

/* Program failed, or left remaining_opt_len untouched: reserve nothing. */
if (err || sock_ops.remaining_opt_len == *remaining)
  return;

opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
/* round up to 4 bytes */
opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;

*remaining -= opts->bpf_opt_len;
}

/* Let the sock_ops BPF program write its header options into the space
 * recorded in opts->bpf_opt_len.
 *
 * Does nothing when opts->bpf_opt_len is 0. The program is run with op
 * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. Any part of the reserved area that the
 * program did not write (all of it if the program returned an error) is
 * filled with TCPOPT_NOP.
 */
static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
        struct request_sock *req,
        struct sk_buff *syn_skb,
        enum tcp_synack_type synack_type,
        struct tcp_out_options *opts)
{
u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
struct bpf_sock_ops_kern sock_ops;
int err;

if (likely(!max_opt_len))
  return;

/* Only the members located before 'temp' are zeroed. */
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;

/* With a request sock, "req" stands in for the socket; otherwise the
 * full socket is passed and is asserted to be owned by the caller.
 */
if (req) {
  sock_ops.sk = (struct sock *)req;
  sock_ops.syn_skb = syn_skb;
} else {
  sock_owned_by_me(sk);

  sock_ops.is_fullsock = 1;
  sock_ops.is_locked_tcp_sock = 1;
  sock_ops.sk = sk;
}

sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
sock_ops.remaining_opt_len = max_opt_len;
/* The reserved area is the last max_opt_len bytes of the TCP header. */
first_opt_off = tcp_hdrlen(skb) - max_opt_len;
bpf_skops_init_skb(&sock_ops, skb, first_opt_off);

err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

if (err)
  nr_written = 0;
else
  nr_written = max_opt_len - sock_ops.remaining_opt_len;

/* Pad whatever the program left unwritten with NOP options. */
if (nr_written < max_opt_len)
  memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
         max_opt_len - nr_written);
}
#else
/* Stub used when CONFIG_CGROUP_BPF is not defined: reserves no option space
 * and leaves @opts and *@remaining untouched.
 */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
      struct request_sock *req,
      struct sk_buff *syn_skb,
      enum tcp_synack_type synack_type,
      struct tcp_out_options *opts,
      unsigned int *remaining)
{
}

/* Stub used when CONFIG_CGROUP_BPF is not defined: writes nothing. */
static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
        struct request_sock *req,
        struct sk_buff *syn_skb,
        enum tcp_synack_type synack_type,
        struct tcp_out_options *opts)
{
}
#endif

/* Write the TCP-AO option header at @ptr and leave room for the MAC.
 *
 * For a request sock (@tcprsk non-NULL) the key id and rnext values come
 * from @tcprsk; otherwise they come from @key and the socket's ao_info.
 * opts->hash_location is set to the first byte after the 4-byte option
 * header, i.e. where the MAC bytes belong.
 *
 * Returns the pointer advanced past the option. It is returned unchanged
 * when CONFIG_TCP_AO is not defined, or when rnext_key is unexpectedly
 * NULL (which also triggers a one-time warning).
 */
static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
          const struct tcp_request_sock *tcprsk,
          struct tcp_out_options *opts,
          struct tcp_key *key, __be32 *ptr)
{
#ifdef CONFIG_TCP_AO
u8 maclen = tcp_ao_maclen(key->ao_key);

if (tcprsk) {
  u8 aolen = maclen + sizeof(struct tcp_ao_hdr);

  /* kind | length | key id | rnext key id */
  *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) |
          (tcprsk->ao_keyid << 8) |
          (tcprsk->ao_rcv_next));
} else {
  struct tcp_ao_key *rnext_key;
  struct tcp_ao_info *ao_info;

  ao_info = rcu_dereference_check(tp->ao_info,
   lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
  rnext_key = READ_ONCE(ao_info->rnext_key);
  if (WARN_ON_ONCE(!rnext_key))
   return ptr;
  /* kind | length | send id of the key | receive id of rnext_key */
  *ptr++ = htonl((TCPOPT_AO << 24) |
          (tcp_ao_len(key->ao_key) << 16) |
          (key->ao_key->sndid << 8) |
          (rnext_key->rcvid));
}
opts->hash_location = (__u8 *)ptr;
/* Skip the whole 32-bit words occupied by the MAC. */
ptr += maclen / sizeof(*ptr);
/* MAC length is not a multiple of 4: pre-fill the trailing word with NOPs
 * so the bytes after the MAC are padding. Presumably the MAC itself is
 * written later through hash_location; that code is not visible here.
 */
if (unlikely(maclen % sizeof(*ptr))) {
  memset(ptr, TCPOPT_NOP, sizeof(*ptr));
  ptr++;
}
#endif
return ptr;
}

/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
* TCP options, we learned this through the hard way, so be careful here.
* Luckily we can at least blame others for their non-compliance but from
* inter-operability perspective it seems that we're somewhat stuck with
* the ordering which we have been using if we want to keep working with
* those broken things (not that it currently hurts anybody as there isn't
* particular reason why the ordering would need to be changed).
*
* At least SACK_PERM as the first option is known to lead to a disaster
* (but it may well be that other scenarios fail similarly).
*/
/* Emit, in fixed order, the options selected in @opts right after the TCP
 * header @th: MD5 or AO, MSS, timestamps (optionally sharing a word with
 * SACK-permitted), SACK-permitted, window scale, SACK blocks, Fast Open
 * cookie, SMC, MPTCP.
 *
 * Side effects visible here: opts->hash_location is set for MD5/AO, and
 * tp->rx_opt.dsack is cleared once SACK blocks have been written.
 */
static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
         const struct tcp_request_sock *tcprsk,
         struct tcp_out_options *opts,
         struct tcp_key *key)
{
__be32 *ptr = (__be32 *)(th + 1);
u16 options = opts->options; /* mungable copy */

if (tcp_key_is_md5(key)) {
  *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
          (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
  /* overload cookie hash location */
  opts->hash_location = (__u8 *)ptr;
  /* Leave four 32-bit words (16 bytes) for the signature. */
  ptr += 4;
} else if (tcp_key_is_ao(key)) {
  ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr);
}
if (unlikely(opts->mss)) {
  *ptr++ = htonl((TCPOPT_MSS << 24) |
          (TCPOLEN_MSS << 16) |
          opts->mss);
}

if (likely(OPTION_TS & options)) {
  if (unlikely(OPTION_SACK_ADVERTISE & options)) {
   /* SACK-permitted takes the place of the two NOPs that would
    * otherwise pad the timestamp option; clear the bit in the
    * local copy so it is not written a second time below.
    */
   *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
           (TCPOLEN_SACK_PERM << 16) |
           (TCPOPT_TIMESTAMP << 8) |
           TCPOLEN_TIMESTAMP);
   options &= ~OPTION_SACK_ADVERTISE;
  } else {
   *ptr++ = htonl((TCPOPT_NOP << 24) |
           (TCPOPT_NOP << 16) |
           (TCPOPT_TIMESTAMP << 8) |
           TCPOLEN_TIMESTAMP);
  }
  *ptr++ = htonl(opts->tsval);
  *ptr++ = htonl(opts->tsecr);
}

/* SACK-permitted on its own (no timestamp option to share a word with). */
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
  *ptr++ = htonl((TCPOPT_NOP << 24) |
          (TCPOPT_NOP << 16) |
          (TCPOPT_SACK_PERM << 8) |
          TCPOLEN_SACK_PERM);
}

if (unlikely(OPTION_WSCALE & options)) {
  *ptr++ = htonl((TCPOPT_NOP << 24) |
          (TCPOPT_WINDOW << 16) |
          (TCPOLEN_WINDOW << 8) |
          opts->ws);
}

if (unlikely(opts->num_sack_blocks)) {
  /* When a D-SACK is pending the blocks are read starting at
   * duplicate_sack, otherwise starting at selective_acks.
   */
  struct tcp_sack_block *sp = tp->rx_opt.dsack ?
   tp->duplicate_sack : tp->selective_acks;
  int this_sack;

  *ptr++ = htonl((TCPOPT_NOP  << 24) |
          (TCPOPT_NOP  << 16) |
          (TCPOPT_SACK <<  8) |
          (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
           TCPOLEN_SACK_PERBLOCK)));

  for (this_sack = 0; this_sack < opts->num_sack_blocks;
       ++this_sack) {
   *ptr++ = htonl(sp[this_sack].start_seq);
   *ptr++ = htonl(sp[this_sack].end_seq);
  }

  /* The pending D-SACK indication is consumed by this segment. */
  tp->rx_opt.dsack = 0;
}

if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
  struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
  u8 *p = (u8 *)ptr;
  u32 len; /* Fast Open option length */

  if (foc->exp) {
   /* Experimental form: kind, length and magic in one word. */
   len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
   *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
         TCPOPT_FASTOPEN_MAGIC);
   p += TCPOLEN_EXP_FASTOPEN_BASE;
  } else {
   /* Standard form: one kind byte and one length byte. */
   len = TCPOLEN_FASTOPEN_BASE + foc->len;
   *p++ = TCPOPT_FASTOPEN;
   *p++ = len;
  }

  memcpy(p, foc->val, foc->len);
  /* Option ends two bytes short of a word boundary: pad with NOPs. */
  if ((len & 3) == 2) {
   p[foc->len] = TCPOPT_NOP;
   p[foc->len + 1] = TCPOPT_NOP;
  }
  /* Advance by the option length rounded up to whole words. */
  ptr += (len + 3) >> 2;
}

/* NOTE(review): smc_options_write() receives ptr by value, so ptr is not
 * advanced past an SMC option before mptcp_options_write() runs.
 * Presumably both options are never selected together; confirm.
 */
smc_options_write(ptr, &options);

mptcp_options_write(th, ptr, tp, opts);
}

/* Select the SMC option for a SYN when tp->syn_smc is set and enough option
 * space is left; *remaining is reduced accordingly. Compiles to an empty
 * function without CONFIG_SMC.
 */
static void smc_set_option(const struct tcp_sock *tp,
			   struct tcp_out_options *opts,
			   unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (!static_branch_unlikely(&tcp_have_smc))
		return;

	if (!tp->syn_smc)
		return;

	if (*remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
		return;

	opts->options |= OPTION_SMC;
	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
#endif
}

/* Select the SMC option for a SYN-ACK when both tp->syn_smc and
 * ireq->smc_ok are set and enough option space is left; *remaining is
 * reduced accordingly. Compiles to an empty function without CONFIG_SMC.
 */
static void smc_set_option_cond(const struct tcp_sock *tp,
				const struct inet_request_sock *ireq,
				struct tcp_out_options *opts,
				unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (!static_branch_unlikely(&tcp_have_smc))
		return;

	if (!(tp->syn_smc && ireq->smc_ok))
		return;

	if (*remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
		return;

	opts->options |= OPTION_SMC;
	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
#endif
}

/* Select the MPTCP option for a SYN-ACK when the request is an MPTCP one,
 * mptcp_synack_options() provides an option, and it fits into *remaining.
 */
static void mptcp_set_option_cond(const struct request_sock *req,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
	unsigned int size;

	if (!rsk_is_mptcp(req))
		return;

	/* 'size' is only looked at when the call reports an option. */
	if (!mptcp_synack_options(req, &size, &opts->mptcp))
		return;

	if (*remaining < size)
		return;

	opts->options |= OPTION_MPTCP;
	*remaining -= size;
}

/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
				    struct tcp_out_options *opts,
				    struct tcp_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
	unsigned int remaining = MAX_TCP_OPTION_SPACE;
	bool timestamps = false;

	/* Better than switch (key.type) as it has static branches.
	 * With an MD5 key, timestamps are never offered on the SYN.
	 */
	if (tcp_key_is_md5(key)) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;
	} else {
		timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
		if (tcp_key_is_ao(key)) {
			opts->options |= OPTION_AO;
			remaining -= tcp_ao_len_aligned(key->ao_key);
		}
	}

	/* An MSS option is always sent. The option bytes that will appear
	 * in normal data packets when timestamps are used must be part of
	 * the advertised MSS, even though they are subtracted from
	 * tp->mss_cache to keep tcp_sendmsg calculations simple. Getting
	 * this wrong would keep us, as a receiver, from recognizing full
	 * sized data packets and so from following the delayed ACK rules.
	 * SACKs do not matter here: an ACK carrying them is never delayed.
	 */
	opts->mss = tcp_advertise_mss(sk);
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(timestamps)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
			      tp->tsoffset;
		opts->tsecr = tp->rx_opt.ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}

	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
		opts->ws = tp->rx_opt.rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}

	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
		opts->options |= OPTION_SACK_ADVERTISE;
		/* SACK-permitted only costs space of its own when it cannot
		 * share a word with the timestamp option.
		 */
		if (unlikely(!(OPTION_TS & opts->options)))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}

	if (fastopen && fastopen->cookie.len >= 0) {
		struct tcp_fastopen_cookie *cookie = &fastopen->cookie;
		u32 need = cookie->len;

		if (cookie->exp)
			need += TCPOLEN_EXP_FASTOPEN_BASE;
		else
			need += TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* Align to 32 bits */

		/* The cookie option is only sent if it fits entirely. */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = cookie;
			remaining -= need;
			tp->syn_fastopen = 1;
			tp->syn_fastopen_exp = cookie->exp ? 1 : 0;
		}
	}

	smc_set_option(tp, opts, &remaining);

	if (sk_is_mptcp(sk)) {
		unsigned int size;

		if (mptcp_syn_options(sk, skb, &size, &opts->mptcp) &&
		    remaining >= size) {
			opts->options |= OPTION_MPTCP;
			remaining -= size;
		}
	}

	/* Finally let a BPF program claim part of what is left. */
	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);

	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Set up TCP options for SYN-ACKs. */
static unsigned int tcp_synack_options(const struct sock *sk,
           struct request_sock *req,
           unsigned int mss, struct sk_buff *skb,
           struct tcp_out_options *opts,
           const struct tcp_key *key,
           struct tcp_fastopen_cookie *foc,
           enum tcp_synack_type synack_type,
           struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;

if (tcp_key_is_md5(key)) {
  opts->options |= OPTION_MD5;
  remaining -= TCPOLEN_MD5SIG_ALIGNED;

  /* We can't fit any SACK blocks in a packet with MD5 + TS
* options. There was discussion about disabling SACK
* rather than TS in order to fit in better with old,
* buggy kernels, but that was deemed to be unnecessary.
*/
  if (synack_type != TCP_SYNACK_COOKIE)
   ireq->tstamp_ok &= !ireq->sack_ok;
} else if (tcp_key_is_ao(key)) {
  opts->options |= OPTION_AO;
  remaining -= tcp_ao_len_aligned(key->ao_key);
  ireq->tstamp_ok &= !ireq->sack_ok;
}

/* We always send an MSS option. */
opts->mss = mss;
remaining -= TCPOLEN_MSS_ALIGNED;

if (likely(ireq->wscale_ok)) {
  opts->ws = ireq->rcv_wscale;
  opts->options |= OPTION_WSCALE;
  remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(ireq->tstamp_ok)) {
  opts->options |= OPTION_TS;
  opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
         tcp_rsk(req)->ts_off;
  if (!tcp_rsk(req)->snt_tsval_first) {
   if (!opts->tsval)
    opts->tsval = ~0U;
   tcp_rsk(req)->snt_tsval_first = opts->tsval;
  }
  WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval);
  opts->tsecr = req->ts_recent;
  remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
  opts->options |= OPTION_SACK_ADVERTISE;
  if (unlikely(!ireq->tstamp_ok))
   remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
if (foc != NULL && foc->len >= 0) {
  u32 need = foc->len;

  need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
       TCPOLEN_FASTOPEN_BASE;
  need = (need + 3) & ~3U;  /* Align to 32 bits */
  if (remaining >= need) {
   opts->options |= OPTION_FAST_OPEN_COOKIE;
   opts->fastopen_cookie = foc;
   remaining -= need;
  }
}

mptcp_set_option_cond(req, opts, &remaining);

smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);

bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
         synack_type, opts, &remaining);

return MAX_TCP_OPTION_SPACE - remaining;
}

/* Compute TCP options for ESTABLISHED sockets. This is not the
* final wire format yet.
*/
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
					    struct tcp_out_options *opts,
					    struct tcp_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int eff_sacks;
	unsigned int size = 0;

	opts->options = 0;

	/* Better than switch (key.type) as it has static branches */
	if (tcp_key_is_md5(key)) {
		opts->options |= OPTION_MD5;
		size += TCPOLEN_MD5SIG_ALIGNED;
	} else if (tcp_key_is_ao(key)) {
		opts->options |= OPTION_AO;
		size += tcp_ao_len_aligned(key->ao_key);
	}

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		/* Without an skb there is no timestamp to derive tsval from. */
		if (skb)
			opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
				      tp->tsoffset;
		else
			opts->tsval = 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		size += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* MPTCP goes before SACK in the competition for the limited option
	 * space: an MPTCP connection would have to fall back to regular TCP
	 * if a required multipath option were missing, whereas SACK can make
	 * do with whatever space is left.
	 */
	if (sk_is_mptcp(sk)) {
		unsigned int room = MAX_TCP_OPTION_SPACE - size;
		unsigned int opt_size = 0;

		if (mptcp_established_options(sk, skb, &opt_size, room,
					      &opts->mptcp)) {
			opts->options |= OPTION_MPTCP;
			size += opt_size;
		}
	}

	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
	if (unlikely(eff_sacks)) {
		const unsigned int room = MAX_TCP_OPTION_SPACE - size;

		/* Not even one SACK block fits: stop here. Note that the
		 * BPF step below is skipped on this path as well.
		 */
		if (unlikely(room < TCPOLEN_SACK_BASE_ALIGNED +
				    TCPOLEN_SACK_PERBLOCK))
			return size;

		/* As many blocks as wanted, limited by the space left. */
		opts->num_sack_blocks =
			min_t(unsigned int, eff_sacks,
			      (room - TCPOLEN_SACK_BASE_ALIGNED) /
			      TCPOLEN_SACK_PERBLOCK);

		size += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}

	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
					    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
		unsigned int room = MAX_TCP_OPTION_SPACE - size;

		bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &room);

		size = MAX_TCP_OPTION_SPACE - room;
	}

	return size;
}

/* TCP SMALL QUEUES (TSQ)
*
* TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
* to reduce RTT and bufferbloat.
* We do this using a special skb destructor (tcp_wfree).
*
* It's important that tcp_wfree() can be replaced by sock_wfree() in the event skb
* needs to be reallocated in a driver.
* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
*
* Since transmit from skb destructor is forbidden, we use a BH work item
* to process all sockets that eventually need to send more skbs.
* We use one work item per cpu, with its own queue of sockets.
*/
/* TSQ state: a work item plus the list of TCP sockets waiting for it to
 * run. One instance exists per CPU (see the DEFINE_PER_CPU below); sockets
 * are linked through tcp_sock.tsq_node.
 */
struct tsq_work {
struct work_struct work;
struct list_head head; /* queue of tcp sockets */
};
static DEFINE_PER_CPU(struct tsq_work, tsq_work);

/* Try to transmit more on @sk: retransmissions first, when lost packets
 * outnumber retransmitted ones and cwnd leaves room, then whatever is
 * waiting in the write queue. Only acts in the states listed below.
 */
static void tcp_tsq_write(struct sock *sk)
{
	struct tcp_sock *tp;

	if (!((1 << sk->sk_state) &
	      (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
	       TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)))
		return;

	tp = tcp_sk(sk);

	if (tp->lost_out > tp->retrans_out &&
	    tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
		tcp_mstamp_refresh(tp);
		tcp_xmit_retransmit_queue(sk);
	}

	tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
		       0, GFP_ATOMIC);
}

/* Run the TSQ transmit step for @sk under the BH socket lock. If the socket
 * is owned by user context, mark the work as deferred instead; the first
 * setter of TCP_TSQ_DEFERRED takes a socket reference, which
 * tcp_release_cb() drops after doing the work.
 */
static void tcp_tsq_handler(struct sock *sk)
{
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
			sock_hold(sk);
	} else {
		tcp_tsq_write(sk);
	}
	bh_unlock_sock(sk);
}
/*
 * One work item per cpu tries to send more skbs.
 * We run in BH context but need to disable irqs when
 * transferring tsq->head because tcp_wfree() might
 * interrupt us (non NAPI drivers)
 *
 * @work: the work_struct embedded in this CPU's struct tsq_work.
 */
static void tcp_tsq_workfn(struct work_struct *work)
{
struct tsq_work *tsq = container_of(work, struct tsq_work, work);
LIST_HEAD(list);
unsigned long flags;
struct list_head *q, *n;
struct tcp_sock *tp;
struct sock *sk;

/* Move every queued socket to a private list and leave tsq->head empty. */
local_irq_save(flags);
list_splice_init(&tsq->head, &list);
local_irq_restore(flags);

list_for_each_safe(q, n, &list) {
  tp = list_entry(q, struct tcp_sock, tsq_node);
  list_del(&tp->tsq_node);

  sk = (struct sock *)tp;
  /* Barrier first, then clear TSQ_QUEUED: tcp_wfree() refuses to queue
   * a socket while that bit is still set.
   */
  smp_mb__before_atomic();
  clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);

  tcp_tsq_handler(sk);
  /* Release the sk_wmem_alloc reference that tcp_wfree() kept when it
   * queued this socket.
   */
  sk_free(sk);
}
}

/* Union of all sk_tsq_flags bits that tcp_release_cb() tests and clears. */
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |  \
     TCPF_WRITE_TIMER_DEFERRED | \
     TCPF_DELACK_TIMER_DEFERRED | \
     TCPF_MTU_REDUCED_DEFERRED | \
     TCPF_ACK_DEFERRED)
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.
 *
 * Atomically fetches and clears every TCP_DEFERRED_ALL bit in
 * sk->sk_tsq_flags, then runs the handler matching each bit that was set.
 */
void tcp_release_cb(struct sock *sk)
{
unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
unsigned long nflags;

/* perform an atomic operation only if at least one flag is set */
do {
  if (!(flags & TCP_DEFERRED_ALL))
   return;
  nflags = flags & ~TCP_DEFERRED_ALL;
} while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));

/* From here on, 'flags' holds the value seen just before the bits were
 * cleared, so each test below reflects what had been deferred.
 */
if (flags & TCPF_TSQ_DEFERRED) {
  tcp_tsq_write(sk);
  /* pairs with the sock_hold() in tcp_tsq_handler() */
  __sock_put(sk);
}

/* NOTE(review): the __sock_put() calls in the next three branches presumably
 * pair with references taken where the respective flag is set; those sites
 * are not in this part of the file - confirm.
 */
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
  tcp_write_timer_handler(sk);
  __sock_put(sk);
}
if (flags & TCPF_DELACK_TIMER_DEFERRED) {
  tcp_delack_timer_handler(sk);
  __sock_put(sk);
}
if (flags & TCPF_MTU_REDUCED_DEFERRED) {
  inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
  __sock_put(sk);
}
/* No reference is dropped here; the ACK is sent only if one is still
 * scheduled.
 */
if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
  tcp_send_ack(sk);
}
EXPORT_IPV6_MOD(tcp_release_cb);

/* Init-time setup of the per-cpu TSQ state: every possible CPU gets an
 * empty socket list and a work item whose handler is tcp_tsq_workfn().
 */
void __init tcp_tsq_work_init(void)
{
	struct tsq_work *tsq;
	int cpu;

	for_each_possible_cpu(cpu) {
		tsq = &per_cpu(tsq_work, cpu);

		INIT_LIST_HEAD(&tsq->head);
		INIT_WORK(&tsq->work, tcp_tsq_workfn);
	}
}

/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 *
 * Uncharges @skb from sk->sk_wmem_alloc and, when the socket is marked
 * TSQF_THROTTLED and not already TSQF_QUEUED, queues it on this CPU's
 * tsq_work list so that tcp_tsq_workfn() can resume transmission.
 */
void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nval, oval;
struct tsq_work *tsq;
bool empty;

/* Keep one reference on sk_wmem_alloc.
 * Will be released by sk_free() from here or tcp_tsq_workfn()
 */
WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

/* If this softirq is serviced by ksoftirqd, we are likely under stress.
 * Wait until our queues (qdisc + devices) are drained.
 * This gives :
 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
 * - chance for incoming ACK (processed by another cpu maybe)
 *   to migrate this flow (skb->ooo_okay will be eventually set)
 */
if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
  goto out;

/* Atomically turn THROTTLED into QUEUED; bail out if the socket is not
 * throttled or has already been queued.
 */
oval = smp_load_acquire(&sk->sk_tsq_flags);
do {
  if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
   goto out;

  nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
} while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));

/* queue this socket to BH workqueue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_work);
empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
/* Only the empty -> non-empty transition queues the work item. */
if (empty)
  queue_work(system_bh_wq, &tsq->work);
local_irq_restore(flags);
/* The kept reference is released later by tcp_tsq_workfn(). */
return;
out:
sk_free(sk);
}

/* Pacing hrtimer callback.
 *
 * Note: Called under soft irq.
 * We can call TCP stack right away, unless socket is owned by user
 * (tcp_tsq_handler() takes care of that distinction).  One socket reference
 * is dropped afterwards and the timer is never restarted from here.
 */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
	struct sock *sk;

	sk = (struct sock *)container_of(timer, struct tcp_sock, pacing_timer);

	tcp_tsq_handler(sk);
	sock_put(sk);

	return HRTIMER_NORESTART;
}

/* Bookkeeping after @skb has been handed to the lower layer.
 *
 * When pacing is active on the socket, advance tp->tcp_wstamp_ns by the time
 * @skb takes at the current pacing rate, less a credit (capped at half of
 * that time) for how far tcp_wstamp_ns had already moved past @prior_wstamp.
 * In all cases @skb is moved to the tail of tp->tsorted_sent_queue.
 */
static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
				      u64 prior_wstamp)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sk->sk_pacing_status != SK_PACING_NONE) {
		unsigned long pacing_rate = READ_ONCE(sk->sk_pacing_rate);
		bool paced;

		/* Original sch_fq does not pace first 10 MSS
		 * Note that tp->data_segs_out overflows after 2^32 packets,
		 * this is a minor annoyance.
		 */
		paced = pacing_rate && pacing_rate != ~0UL &&
			tp->data_segs_out >= 10;
		if (paced) {
			u64 xmit_ns = div64_ul((u64)skb->len * NSEC_PER_SEC,
					       pacing_rate);
			u64 slack_ns = tp->tcp_wstamp_ns - prior_wstamp;

			/* take into account OS jitter */
			xmit_ns -= min_t(u64, xmit_ns / 2, slack_ns);
			tp->tcp_wstamp_ns += xmit_ns;
		}
	}

	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}

/* Declarations for functions named as candidate targets in the
 * INDIRECT_CALL_INET() invocations of __tcp_transmit_skb().
 */
INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 *
 * @sk:       sending socket
 * @skb:      headerless segment to transmit
 * @clone_it: when non-zero, @skb is left untouched and a clone (or a
 *            pskb_copy() if @skb is already cloned) is what gets sent
 * @gfp_mask: allocation flags for that clone/copy
 * @rcv_nxt:  value written to the header's ack_seq field
 *
 * Returns 0 on success, -ENOBUFS if the clone/copy cannot be allocated,
 * -ENOMEM if tcp_ao_transmit_skb() reports an error, otherwise the result of
 * the queue_xmit operation (passed through net_xmit_eval() when positive).
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
         int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct sk_buff *oskb = NULL;
struct tcp_key key;
struct tcphdr *th;
u64 prior_wstamp;
int err;

BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
/* Remember the previous departure time (used for the pacing credit in
 * tcp_update_skb_after_send()), then make sure the new one is not
 * earlier than the cached clock.
 */
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
if (clone_it) {
  /* oskb keeps pointing at the caller's skb; 'skb' becomes the copy. */
  oskb = skb;

  tcp_skb_tsorted_save(oskb) {
   if (unlikely(skb_cloned(oskb)))
    skb = pskb_copy(oskb, gfp_mask);
   else
    skb = skb_clone(oskb, gfp_mask);
  } tcp_skb_tsorted_restore(oskb);

  if (unlikely(!skb))
   return -ENOBUFS;
  /* retransmit skbs might have a non zero value in skb->dev
   * because skb->dev is aliased with skb->rbnode.rb_left
   */
  skb->dev = NULL;
}

inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));

tcp_get_current_key(sk, &key);
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
  tcp_options_size = tcp_syn_options(sk, skb, &opts, &key);
} else {
  tcp_options_size = tcp_established_options(sk, skb, &opts, &key);
  /* Force a PSH flag on all (GSO) packets to expedite GRO flush
   * at receiver : This slightly improves GRO performance.
   * Note that we do not force the PSH flag for non GSO packets,
   * because they might be sent under high congestion events,
   * and in this case it is better to delay the delivery of 1-MSS
   * packets and thus the corresponding ACK packet that would
   * release the following packet.
   */
  if (tcp_skb_pcount(skb) > 1)
   tcb->tcp_flags |= TCPHDR_PSH;
}
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

/* We set skb->ooo_okay to one if this packet can select
 * a different TX queue than prior packets of this flow,
 * to avoid self inflicted reorders.
 * The 'other' queue decision is based on current cpu number
 * if XPS is enabled, or sk->sk_txhash otherwise.
 * We can switch to another (and better) queue if:
 * 1) No packet with payload is in qdisc/device queues.
 *    Delays in TX completion can defeat the test
 *    even if packets were already sent.
 * 2) Or rtx queue is empty.
 *    This mitigates above case if ACK packets for
 *    all prior packets were already processed.
 */
skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) ||
   tcp_rtx_queue_empty(sk);

/* If we had to use memory reserve to allocate this skb,
 * this might cause drops if packet is looped back :
 * Other socket might not have SOCK_MEMALLOC.
 * Packets not looped back do not care about pfmemalloc.
 */
skb->pfmemalloc = 0;

skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);

/* Re-own the skb: pure ACKs get __sock_wfree as destructor, everything
 * else gets the TSQ-aware tcp_wfree; then charge it to sk_wmem_alloc.
 */
skb_orphan(skb);
skb->sk = sk;
skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);

skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));

/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source  = inet->inet_sport;
th->dest  = inet->inet_dport;
th->seq   = htonl(tcb->seq);
th->ack_seq  = htonl(rcv_nxt);
/* 16-bit word at offset 6 of the header: header length in 32-bit words in
 * the top 4 bits, TCP flags in the bits selected by TCPHDR_FLAGS_MASK.
 */
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
     (tcb->tcp_flags & TCPHDR_FLAGS_MASK));

th->check  = 0;
th->urg_ptr  = 0;

/* The urg_mode check is necessary during a below snd_una win probe */
if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
  if (before(tp->snd_up, tcb->seq + 0x10000)) {
   th->urg_ptr = htons(tp->snd_up - tcb->seq);
   th->urg = 1;
  } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
   th->urg_ptr = htons(0xFFFF);
   th->urg = 1;
  }
}

skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
  th->window      = htons(tcp_select_window(sk));
  tcp_ecn_send(sk, skb, th, tcp_header_size);
} else {
  /* RFC1323: The window in SYN & SYN/ACK segments
   * is never scaled.
   */
  th->window = htons(min(tp->rcv_wnd, 65535U));
}

tcp_options_write(th, tp, NULL, &opts, &key);

if (tcp_key_is_md5(&key)) {
#ifdef CONFIG_TCP_MD5SIG
  /* Calculate the MD5 hash, as we have all we need now */
  sk_gso_disable(sk);
  tp->af_specific->calc_md5_hash(opts.hash_location,
            key.md5_key, sk, skb);
#endif
} else if (tcp_key_is_ao(&key)) {
  /* NOTE(review): this local shadows the function-scope 'err' declared
   * above; the outer one is only assigned later, by queue_xmit.
   */
  int err;

  err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
       opts.hash_location);
  if (err) {
   kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
   return -ENOMEM;
  }
}

/* BPF prog is the last one writing header option */
bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);

INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
      tcp_v6_send_check, tcp_v4_send_check,
      sk, skb);

if (likely(tcb->tcp_flags & TCPHDR_ACK))
  tcp_event_ack_sent(sk, rcv_nxt);

/* skb->len now includes the TCP header, so this tests for payload. */
if (skb->len != tcp_header_size) {
  tcp_event_data_sent(tp, sk);
  tp->data_segs_out += tcp_skb_pcount(skb);
  tp->bytes_sent += skb->len - tcp_header_size;
}

/* OUTSEGS is bumped only for segments extending past snd_nxt or carrying
 * no sequence space (seq == end_seq).
 */
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
  TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
         tcp_skb_pcount(skb));

tp->segs_out += tcp_skb_pcount(skb);
skb_set_hash_from_sk(skb, sk);
/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */

/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
          sizeof(struct inet6_skb_parm)));

tcp_add_tx_delay(skb, tp);

err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
     inet6_csk_xmit, ip_queue_xmit,
     sk, skb, &inet->cork.fl);

/* A positive return value makes us enter CWR before it is normalized. */
if (unlikely(err > 0)) {
  tcp_enter_cwr(sk);
  err = net_xmit_eval(err);
}
/* Post-send bookkeeping applies to the original skb, and only when a
 * clone was sent (oskb is NULL otherwise).
 */
if (!err && oskb) {
  tcp_update_skb_after_send(sk, oskb, prior_wstamp);
  tcp_rate_skb_sent(sk, oskb);
}
return err;
}

/* Transmit @skb via __tcp_transmit_skb(), acknowledging the socket's
 * current rcv_nxt.  Arguments and return value are those of
 * __tcp_transmit_skb().
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const u32 ack_seq = tcp_sk(sk)->rcv_nxt;

	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, ack_seq);
}

/* This routine just queues the buffer for sending.
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
* otherwise socket can stall.
*/
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);

/* Advance write_seq and place onto the write_queue. */
WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
}

/* Initialize TSO segments for a packet. */
static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs;

if (skb->len <= mss_now) {
  /* Avoid the costly divide in the normal
* non-TSO case.
*/
  TCP_SKB_CB(skb)->tcp_gso_size = 0;
  tcp_skb_pcount_set(skb, 1);
  return 1;
}
TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
tso_segs = DIV_ROUND_UP(skb->len, mss_now);
tcp_skb_pcount_set(skb, tso_segs);
return tso_segs;
}

/* The pcount of an skb in the middle of the write queue changed by @decr:
 * fix up packets_out and whichever of sacked_out / retrans_out / lost_out
 * the skb's 'sacked' bits say it was counted in.
 */
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
	const unsigned int sacked_bits = TCP_SKB_CB(skb)->sacked;
	struct tcp_sock *tp = tcp_sk(sk);

	tp->packets_out -= decr;

	if (sacked_bits & TCPCB_SACKED_ACKED)
		tp->sacked_out -= decr;
	if (sacked_bits & TCPCB_SACKED_RETRANS)
		tp->retrans_out -= decr;
	if (sacked_bits & TCPCB_LOST)
		tp->lost_out -= decr;

	/* Reno case is special. Sigh...
	 * sacked_out is lowered further, but never below zero.
	 */
	if (tcp_is_reno(tp) && decr > 0)
		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

	tcp_verify_left_out(tp);
}

static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->txstamp_ack ||
  (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
}

/* After splitting @skb into @skb and @skb2, move the tx timestamp request
 * to @skb2 when the timestamp key is not before @skb2's starting sequence:
 * the SKBTX_ANY_TSTAMP flags are transferred, the tskeys are swapped and
 * txstamp_ack follows.
 */
static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
	struct skb_shared_info *head_info = skb_shinfo(skb);
	struct skb_shared_info *tail_info;
	u8 ts_bits;

	if (likely(!tcp_has_tx_tstamp(skb)))
		return;
	if (before(head_info->tskey, TCP_SKB_CB(skb2)->seq))
		return;

	tail_info = skb_shinfo(skb2);
	ts_bits = head_info->tx_flags & SKBTX_ANY_TSTAMP;

	head_info->tx_flags &= ~ts_bits;
	tail_info->tx_flags |= ts_bits;
	swap(head_info->tskey, tail_info->tskey);

	TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
	TCP_SKB_CB(skb)->txstamp_ack = 0;
}

static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
{
TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
TCP_SKB_CB(skb)->eor = 0;
}

/* Insert @buff after @skb on the write or rtx queue of @sk, as selected by
 * @tcp_queue.  For the rtx queue the position comes from the rbtree insert
 * and @skb is not used.
 */
static void tcp_insert_write_queue_after(struct sk_buff *skb,
					 struct sk_buff *buff,
					 struct sock *sk,
					 enum tcp_queue tcp_queue)
{
	if (tcp_queue != TCP_FRAG_IN_WRITE_QUEUE) {
		tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
		return;
	}

	__skb_queue_after(&sk->sk_write_queue, skb, buff);
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 *
 * @sk:        owning socket
 * @tcp_queue: which queue @skb lives on (write queue or rtx queue)
 * @skb:       segment to shrink
 * @len:       number of bytes that stay in @skb
 * @mss_now:   MSS used to recompute the segment counts of both halves
 * @gfp:       allocation flags
 *
 * Returns 0 on success, -EINVAL if @len exceeds skb->len, -ENOMEM if the
 * queued-memory limit check fails, the skb cannot be uncloned, or the new
 * skb cannot be allocated.
 */
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
   struct sk_buff *skb, u32 len,
   unsigned int mss_now, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int old_factor;
long limit;
u16 flags;
int nlen;

if (WARN_ON(len > skb->len))
  return -EINVAL;

DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));

/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
 * We need some allowance to not penalize applications setting small
 * SO_SNDBUF values.
 * Also allow first and last skb in retransmit queue to be split.
 */
limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
       tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
       skb != tcp_rtx_queue_head(sk) &&
       skb != tcp_rtx_queue_tail(sk))) {
  NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
  return -ENOMEM;
}

if (skb_unclone_keeptruesize(skb, gfp))
  return -ENOMEM;

/* Get a new skb... force flag on. */
buff = tcp_stream_alloc_skb(sk, gfp, true);
if (!buff)
  return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);

/* Charge the new skb's own truesize to the socket, then shift nlen
 * (the bytes that move to buff) of truesize from skb to buff; that
 * shift leaves the combined truesize of the pair unchanged.
 */
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len;
buff->truesize += nlen;
skb->truesize -= nlen;

/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
tcp_skb_fragment_eor(skb, buff);

skb_split(skb, buff, len);

skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC);
tcp_fragment_tstamp(skb, buff);

/* Sample the old segment count before it is recomputed below. */
old_factor = tcp_skb_pcount(skb);

/* Fix up tso_factor for both original and new SKB.  */
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);

/* Update delivered info for the new segment */
TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;

/* If this packet has been sent out already, we must
 * adjust the various packet counters.
 */
if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
  int diff = old_factor - tcp_skb_pcount(skb) -
   tcp_skb_pcount(buff);

  if (diff)
   tcp_adjust_pcount(sk, skb, diff);
}

/* Link BUFF into the send queue. */
__skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
/* On the rtx queue, buff is also linked right after skb on the
 * time-sorted anchor list.
 */
if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
  list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);

return 0;
}

/* This is similar to __pskb_pull_tail(). The difference is that pulled
 * data is not copied, but immediately discarded.
 *
 * Drops the first @len bytes of @skb's page fragments (the skb is expected
 * to have no linear data, see the debug check below) and returns @len.
 */
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
struct skb_shared_info *shinfo;
int i, k, eat;

DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
eat = len;	/* bytes still to discard */
k = 0;		/* next slot for a surviving fragment */
shinfo = skb_shinfo(skb);
for (i = 0; i < shinfo->nr_frags; i++) {
  int size = skb_frag_size(&shinfo->frags[i]);

  if (size <= eat) {
   /* Fragment is consumed entirely: release it. */
   skb_frag_unref(skb, i);
   eat -= size;
  } else {
   /* Fragment survives: compact it down to slot k and, if it is
    * the one where the cut falls, advance its offset and shrink
    * its size by the remaining byte count.
    */
   shinfo->frags[k] = shinfo->frags[i];
   if (eat) {
    skb_frag_off_add(&shinfo->frags[k], eat);
    skb_frag_size_sub(&shinfo->frags[k], eat);
    eat = 0;
   }
   k++;
  }
}
shinfo->nr_frags = k;

/* All data is paged, so len simply mirrors data_len afterwards. */
skb->data_len -= len;
skb->len = skb->data_len;
return len;
}

/* Remove acked data from a packet in the transmit queue.
 *
 * Discards the first @len bytes of @skb, advances its starting sequence
 * number and reduces the skb truesize and the socket's queued-memory
 * accounting by the same amount.
 *
 * Returns 0 on success or -ENOMEM if the skb could not be uncloned.
 */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
u32 delta_truesize;

if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
  return -ENOMEM;

/* __pskb_trim_head() returns the number of bytes removed (i.e. len). */
delta_truesize = __pskb_trim_head(skb, len);

TCP_SKB_CB(skb)->seq += len;

skb->truesize    -= delta_truesize;
sk_wmem_queued_add(sk, -delta_truesize);
/* Memory is not uncharged when skb_zcopy_pure() is true for this skb. */
if (!skb_zcopy_pure(skb))
  sk_mem_uncharge(sk, delta_truesize);

/* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
  tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

return 0;
}

/* Calculate MSS not accounting any TCP options.  */
static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
int mss_now;

/* Calculate base mss without TCP options:
   It is MMS_S - sizeof(tcphdr) of rfc1122
*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
  mss_now = tp->rx_opt.mss_clamp;

/* Now subtract optional transport overhead */
mss_now -= icsk->icsk_ext_hdr_len;

/* Then reserve room for full set of TCP options and 8 bytes of data */
mss_now = max(mss_now,
        READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
return mss_now;
}

/* Calculate MSS. Not accounting for SACKs here.  */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
/* Subtract TCP options size, not including SACKs */
return __tcp_mtu_to_mss(sk, pmtu) -
        (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}
EXPORT_IPV6_MOD(tcp_mtu_to_mss);

/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);

return mss +
       tp->tcp_header_len +
       icsk->icsk_ext_hdr_len +
       icsk->icsk_af_ops->net_header_len;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);

/* MTU probing init per socket.
 *
 * Probing is enabled when the tcp_mtu_probing sysctl is greater than 1.
 * The search range runs from the MTU matching the tcp_base_mss sysctl up to
 * mss_clamp plus the TCP and network headers.  The probe timestamp is only
 * set when probing is enabled.
 */
void tcp_mtup_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	int base_mss;

	icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				      icsk->icsk_af_ops->net_header_len;

	base_mss = READ_ONCE(net->ipv4.sysctl_tcp_base_mss);
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, base_mss);
	icsk->icsk_mtup.probe_size = 0;

	if (!icsk->icsk_mtup.enabled)
		return;

	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}

/* This function synchronizes the sending mss with the current pmtu/exthdr
 * set.
 *
 * tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does
 * NOT count TCP options, but includes only the bare TCP header.
 *
 * tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
 * It is the minimum of user_mss and the mss received with SYN.
 * It also does not include TCP options.
 *
 * inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
 *
 * tp->mss_cache is the current effective sending mss, including
 * all tcp options except for SACKs. It is evaluated
 * taking into account the current pmtu, but never exceeds
 * tp->rx_opt.mss_clamp.
 *
 * NOTE1. rfc1122 clearly states that advertised MSS
 * DOES NOT include either tcp or ip options.
 *
 * NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
 * are READ ONLY outside this function. --ANK (980731)
 *
 * Returns the value stored in tp->mss_cache.
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int mss;

	/* A smaller path MTU also lowers the top of the probing range. */
	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss = tcp_bound_to_half_wnd(tp, tcp_mtu_to_mss(sk, pmtu));

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	if (icsk->icsk_mtup.enabled) {
		int probe_low_mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low);

		/* While probing, stay at or below the low end of the search. */
		mss = min(mss, probe_low_mss);
	}
	tp->mss_cache = mss;

	return mss;
}
EXPORT_IPV6_MOD(tcp_sync_mss);

/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * Starts from tp->mss_cache, resynchronizes it when the route's MTU differs
 * from the cached pmtu cookie, then corrects for the difference between the
 * header length needed right now and tp->tcp_header_len.
 */
unsigned int tcp_current_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_out_options opts;
	unsigned int hdr_len;
	struct tcp_key key;
	u32 mss = tp->mss_cache;

	if (dst) {
		u32 path_mtu = dst_mtu(dst);

		if (path_mtu != inet_csk(sk)->icsk_pmtu_cookie)
			mss = tcp_sync_mss(sk, path_mtu);
	}

	tcp_get_current_key(sk, &key);
	hdr_len = tcp_established_options(sk, NULL, &opts, &key) +
		  sizeof(struct tcphdr);

	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
	 * some common options. If this is an odd packet (because we have SACK
	 * blocks etc) then our calculated header length will be different,
	 * and we have to adjust the mss correspondingly.
	 */
	if (hdr_len != tp->tcp_header_len)
		mss -= (int) hdr_len - tp->tcp_header_len;

	return mss;
}

/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
* and if application hit its sndbuf limit recently.
*/
static void tcp_cwnd_application_limited(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);

if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
     sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
  /* Limited by application or receiver window. */
  u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
  u32 win_used = max(tp->snd_cwnd_used, init_win);
  if (win_used < tcp_snd_cwnd(tp)) {
   tp->snd_ssthresh = tcp_current_ssthresh(sk);
   tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
  }
  tp->snd_cwnd_used = 0;
}
tp->snd_cwnd_stamp = tcp_jiffies32;
}

/* Update congestion-window usage tracking after a transmit attempt.
 *
 * @sk:              socket
 * @is_cwnd_limited: caller's verdict on whether sending was limited by cwnd
 *
 * Records cwnd utilization for the current window, then either resets the
 * "cwnd used" tracking (when cwnd limited) or, when the network is not kept
 * full, may shrink an unused cwnd and start the sndbuf-limited chrono.
 */
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
struct tcp_sock *tp = tcp_sk(sk);

/* Track the strongest available signal of the degree to which the cwnd
 * is fully utilized. If cwnd-limited then remember that fact for the
 * current window. If not cwnd-limited then track the maximum number of
 * outstanding packets in the current window. (If cwnd-limited then we
 * chose to not update tp->max_packets_out to avoid an extra else
 * clause with no functional impact.)
 */
if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
     is_cwnd_limited ||
     (!tp->is_cwnd_limited &&
      tp->packets_out > tp->max_packets_out)) {
  tp->is_cwnd_limited = is_cwnd_limited;
  tp->max_packets_out = tp->packets_out;
  tp->cwnd_usage_seq = tp->snd_nxt;
}

if (tcp_is_cwnd_limited(sk)) {
  /* Network is fed fully. */
  tp->snd_cwnd_used = 0;
  tp->snd_cwnd_stamp = tcp_jiffies32;
} else {
  /* Network starves. */
  if (tp->packets_out > tp->snd_cwnd_used)
   tp->snd_cwnd_used = tp->packets_out;

  /* Shrink an under-used cwnd only if the sysctl allows it, at least
   * one RTO has passed since the last stamp, and the congestion
   * control module has no cong_control hook.
   */
  if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
      (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
      !ca_ops->cong_control)
   tcp_cwnd_application_limited(sk);

  /* The following conditions together indicate the starvation
   * is caused by insufficient sender buffer:
   * 1) just sent some data (see tcp_write_xmit)
   * 2) not cwnd limited (this else condition)
   * 3) no more data to send (tcp_write_queue_empty())
   * 4) application is hitting buffer limit (SOCK_NOSPACE)
   */
  if (tcp_write_queue_empty(sk) && sk->sk_socket &&
      test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
      (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
   tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}

/* Minshall's variant of the Nagle send check. */
static bool tcp_minshall_check(const struct tcp_sock *tp)
{
return after(tp->snd_sml, tp->snd_una) &&
  !after(tp->snd_sml, tp->snd_nxt);
}

/* Update snd_sml if this skb ends with a segment shorter than the mss.
 * Note that a TSO packet might end with a sub-mss segment.
 * The test is really :
 * if ((skb->len % mss) != 0)
 *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
 * But comparing against pcount * mss_now avoids doing the divide again,
 * given the segment count is already stored in the skb.
 */
static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
				const struct sk_buff *skb)
{
	const unsigned int full_sized_len = tcp_skb_pcount(skb) * mss_now;

	if (skb->len >= full_sized_len)
		return;

	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}

/* Return false, if packet can be sent now without violating Nagle's rules:
 * 1. It is full sized. (provided by caller in %partial bool)
 * 2. Or it contains FIN. (already checked by caller)
 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
 *    With Minshall's modification: all sent small packets are ACKed.
 *
 * A true return therefore means the partial segment must be held back.
 */
static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
			    int nonagle)
{
	/* Full sized segments are never delayed. */
	if (!partial)
		return false;

	/* Corked: always hold partial segments. */
	if (nonagle & TCP_NAGLE_CORK)
		return true;

	/* Any other nonagle bit disables the delay. */
	if (nonagle)
		return false;

	return tp->packets_out && tcp_minshall_check(tp);
}

/* Return how many segs we'd like on a TSO packet,
 * depending on current pacing rate, and how close the peer is.
 *
 * Rationale is:
 * - For close peers, we rather send bigger packets to reduce
 *   cpu costs, because occasional losses will be repaired fast.
 * - For long distance/rtt flows, we would like to get ACK clocking
 *   with 1 ACK per ms.
 *
 * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
 * in bigger TSO bursts. We cut the RTT-based allowance in half
 * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
 * is below 1500 bytes after 6 * ~500 usec = 3ms.
 *
 * The byte budget is capped at sk_gso_max_size and the result is never
 * below @min_tso_segs.
 */
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
			    int min_tso_segs)
{
	unsigned long budget;
	u32 rtt_shift;

	/* Pacing-rate based share. */
	budget = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);

	/* RTT based allowance; skipped when the shift would be too wide. */
	rtt_shift = tcp_min_rtt(tcp_sk(sk)) >>
		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
	if (rtt_shift < BITS_PER_TYPE(sk->sk_gso_max_size))
		budget += sk->sk_gso_max_size >> rtt_shift;

	if (budget > sk->sk_gso_max_size)
		budget = sk->sk_gso_max_size;

	return max_t(u32, budget / mss_now, min_tso_segs);
}

/* Return the number of segments we want in the skb we are transmitting.
 * The minimum comes from the congestion control module when it provides a
 * min_tso_segs hook, else from the tcp_min_tso_segs sysctl; the autosized
 * value is then capped at sk_gso_max_segs.
 */
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	u32 floor_segs, wanted;

	if (ca_ops->min_tso_segs)
		floor_segs = ca_ops->min_tso_segs(sk);
	else
		floor_segs = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);

	wanted = tcp_tso_autosize(sk, mss_now, floor_segs);

	return min_t(u32, wanted, sk->sk_gso_max_segs);
}

/* Returns the portion of skb which can be sent right away.
 *
 * The result is bounded by @max_segs full segments, by the room left in the
 * send window and by the skb length; a trailing partial segment is dropped
 * from the result when the Nagle check says it must wait.
 */
static unsigned int tcp_mss_split_point(const struct sock *sk,
					const struct sk_buff *skb,
					unsigned int mss_now,
					unsigned int max_segs,
					int nonagle)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 wnd_room, burst_len, sendable, tail;

	wnd_room = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	burst_len = mss_now * max_segs;

	/* Fast path: the whole burst fits and more data is queued behind. */
	if (likely(burst_len <= wnd_room && skb != tcp_write_queue_tail(sk)))
		return burst_len;

	sendable = min(skb->len, wnd_room);
	if (burst_len <= sendable)
		return burst_len;

	/* If last segment is not a full MSS, check if Nagle rules allow us
	 * to include this last segment in this skb.
	 * Otherwise, we'll split the skb at last MSS boundary
	 */
	tail = sendable % mss_now;
	if (tcp_nagle_check(tail != 0, tp, nonagle))
		sendable -= tail;

	return sendable;
}

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed;
 * returns 0 when the window is already full.
 */
static u32 tcp_cwnd_test(const struct tcp_sock *tp)
{
	u32 in_flight = tcp_packets_in_flight(tp);
	u32 cwnd = tcp_snd_cwnd(tp);
	u32 room, half;

	if (in_flight >= cwnd)
		return 0;

	room = cwnd - in_flight;

	/* For better scheduling, ensure we have at least
	 * 2 GSO packets in flight: allow at most half of cwnd (but no
	 * less than one segment) per burst.
	 */
	half = max(cwnd >> 1, 1U);

	return min(half, room);
}

/* Initialize TSO state of a skb.
* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);

if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
  return tcp_set_skb_tso_segs(skb, mss_now);

return tso_segs;
}

/* Return true if the Nagle test allows this packet to be
 * sent now.
 */
static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
				  unsigned int cur_mss, int nonagle)
{
	/* Nagle rule does not apply to frames, which sit in the middle of the
	 * write_queue (they have no chances to get new data).
	 *
	 * This is implemented in the callers, where they modify the 'nonagle'
	 * argument based upon the location of SKB in the send queue.
	 *
	 * Nor is it used for urgent data or for the final FIN.
	 */
	if ((nonagle & TCP_NAGLE_PUSH) ||
	    tcp_urg_mode(tp) ||
	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
		return true;

	return !tcp_nagle_check(skb->len < cur_mss, tp, nonagle);
}

/* Does at least the first segment of SKB fit into the send window?
 * For an skb longer than @cur_mss only its first @cur_mss bytes are
 * considered.
 */
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
			     const struct sk_buff *skb,
			     unsigned int cur_mss)
{
	u32 first_seg_end;

	if (skb->len > cur_mss)
		first_seg_end = TCP_SKB_CB(skb)->seq + cur_mss;
	else
		first_seg_end = TCP_SKB_CB(skb)->end_seq;

	return !after(first_seg_end, tcp_wnd_end(tp));
}

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
* which is put after SKB on the list.  It is very much like
* tcp_fragment() except that it may make several kinds of assumptions
* in order to speed up the splitting operation.  In particular, we
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
   unsigned int mss_now, gfp_t gfp)
{
int nlen = skb->len - len;
struct sk_buff *buff;
u16 flags;

/* All of a TSO frame must be composed of paged data.  */
DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);

buff = tcp_stream_alloc_skb(sk, gfp, true);
if (unlikely(!buff))
  return -ENOMEM;
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);

sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;

/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.21 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.