/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP module. * * Version: @(#)tcp.h 1.0.5 05/23/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*/ #ifndef _TCP_H #define _TCP_H
#define TCP_RETR1 3 /* * This is how many retries it does before it * tries to figure out if the gateway is * down. Minimal RFC value is 3; it corresponds * to ~3sec-8min depending on RTO.
*/
#define TCP_RETR2 15 /* * This should take at least * 90 minutes to time out. * RFC1122 says that the limit is 100 sec. * 15 is ~13-30min depending on RTO.
*/
#define TCP_SYN_RETRIES 6 /* This is how many retries are done * when active opening a connection. * RFC1122 says the minimum retry MUST * be at least 180secs. Nevertheless * this value is corresponding to * 63secs of retransmission with the * current initial RTO.
*/
#define TCP_SYNACK_RETRIES 5 /* This is how may retries are done * when passive opening a connection. * This is corresponding to 31secs of * retransmission with the current * initial RTO.
*/
#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */ #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN /* BSD style FIN_WAIT2 deadlock breaker. * It used to be 3min, new value is 60sec, * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer.
*/ #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) #else #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif #define TCP_RTO_MAX_SEC 120 #define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ)) #define TCP_RTO_MIN ((unsigned)(HZ / 5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the * initial data transmission if no * valid RTT sample has been acquired, * most likely due to retrans in 3WHS.
*/
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources.
*/ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ)
/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds * to avoid overflows. This assumes a clock smaller than 1 Mhz. * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz.
*/ #define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN * to provide reliability equal to one * provided by timewait state.
*/ #define TCP_PAWS_WINDOW 1 /* Replay window for per-host * timestamps. It must be less than * minimal timewait lifetime.
*/ /* * TCP option
*/
#define TCPOPT_NOP 1 /* Padding */ #define TCPOPT_EOL 0 /* End of options */ #define TCPOPT_MSS 2 /* Segment size negotiating */ #define TCPOPT_WINDOW 3 /* Window scaling */ #define TCPOPT_SACK_PERM 4 /* SACK Permitted */ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
*/ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9
/* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 #define TCPOLEN_WSCALE_ALIGNED 4 #define TCPOLEN_SACKPERM_ALIGNED 4 #define TCPOLEN_SACK_BASE 2 #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
/* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
/* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */
/* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10
/* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */
/* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200
/* Force enable TFO on all listeners, i.e., not requiring the * TCP_FASTOPEN socket option.
*/ #define TFO_SERVER_WO_SOCKOPT1 0x400
/* sysctl variables for tcp */
/* Fix: "externint"/"externlong" were fused into single invalid tokens. */
extern int sysctl_tcp_max_orphans;
extern long sysctl_tcp_mem[3];
/* RACK loss-detection behavior flags (sysctl_tcp_recovery bit mask).
 * Fix: three #define directives were merged onto one physical line,
 * which is invalid preprocessing; each directive must begin its own line.
 */
#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
/* optimized version of sk_under_memory_pressure() for TCP sockets */ staticinlinebool tcp_under_memory_pressure(conststruct sock *sk)
{ if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
mem_cgroup_under_socket_pressure(sk->sk_memcg)) returntrue;
return READ_ONCE(tcp_memory_pressure);
} /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic).
*/
/* Syncookies use a monotonic timer which increments every 60 seconds.
 * This counter is used both as a hash input and partially encoded into
 * the cookie value. A cookie is only validated further if the delta
 * between the current counter value and the encoded one is less than this,
 * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
 * the counter advances immediately after a cookie is generated).
 *
 * Fix: three #define directives were merged onto one physical line,
 * which is invalid preprocessing.
 */
#define MAX_SYNCOOKIE_AGE	2
#define TCP_SYNCOOKIE_PERIOD	(60 * HZ)
#define TCP_SYNCOOKIE_VALID	(MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)
/* syncookies: remember time of last synqueue overflow
 * But do not dirty this field too often (once per second is enough)
 * It is racy as we do not hold a lock, but race is very minor.
 *
 * NOTE(review): this block appears corrupted by extraction. It is declared
 * void, yet it ends with "return !time_between32(...)" — the body of a
 * separate "no recent syncookie overflow?" predicate. The tail of
 * tcp_synq_overflow() (the non-reuseport ts_recent_stamp update) and the
 * header of the second function look dropped. Restore both functions from
 * the original header before relying on this code.
 */
staticinlinevoid tcp_synq_overflow(conststruct sock *sk)
{ unsignedint last_overflow; unsignedint now = jiffies;
if (sk->sk_reuseport) { struct sock_reuseport *reuse;
reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) {
last_overflow = READ_ONCE(reuse->synq_overflow_ts); if (!time_between32(now, last_overflow,
last_overflow + HZ))
WRITE_ONCE(reuse->synq_overflow_ts, now); return;
}
}
/* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID,
 * then we're under synflood. However, we have to use
 * 'last_overflow - HZ' as lower bound. That's because a concurrent
 * tcp_synq_overflow() could update .ts_recent_stamp after we read
 * jiffies but before we store .ts_recent_stamp into last_overflow,
 * which could lead to rejecting a valid syncookie.
 */
return !time_between32(now, last_overflow - HZ,
last_overflow + TCP_SYNCOOKIE_VALID);
}
staticinline u32 tcp_cookie_time(void)
{
u64 val = get_jiffies_64();
do_div(val, TCP_SYNCOOKIE_PERIOD); return val;
}
/* Convert one nsec 64bit timestamp to ts (ms or usec resolution).
 * NOTE(review): body looks truncated by extraction — only the usec branch
 * is present; the ms branch and the closing brace are missing. Restore
 * from the original header.
 */
staticinline u64 tcp_ns_to_ts(bool usec_ts, u64 val)
{ if (usec_ts) return div_u64(val, NSEC_PER_USEC);
/* Bound MSS / TSO packet size with the half of the window.
 * NOTE(review): truncated by extraction — @cutoff is computed but the
 * final return statement and the closing brace are missing. Restore from
 * the original header.
 */
staticinlineint tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{ int cutoff;
/* When peer uses tiny windows, there is no use in packetizing
 * to sub-MSS pieces for the sake of SWS or making sure there
 * are enough packets in the pipe for fast recovery.
 *
 * On the other hand, for extremely large MSS devices, handling
 * smaller than MSS windows in this way does make sense.
 */
if (tp->max_window > TCP_MSS_DEFAULT)
cutoff = (tp->max_window >> 1); else
cutoff = tp->max_window;
/* NOTE(review): truncated by extraction — only the MPTCP early-return is
 * present; the rest of the body and the closing brace are missing.
 * Restore from the original header.
 */
staticinlinevoid __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{ /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return;
/* Minimum RTT in usec. ~0 means not available. */ staticinline u32 tcp_min_rtt(conststruct tcp_sock *tp)
{ return minmax_get(&tp->rtt_min);
}
/* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window.
*/ staticinline u32 tcp_receive_window(conststruct tcp_sock *tp)
{
s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
if (win < 0)
win = 0; return (u32) win;
}
/* Choose a new window, without checks for shrinking, and without
 * scaling applied to the result. The caller does these things
 * if necessary. This is a "raw" window selection.
 */
u32 __tcp_select_window(struct sock *sk);
/* Send a zero-window probe on @sk. */
void tcp_send_window_probe(struct sock *sk);
/* TCP uses 32bit jiffies to save some space.
 * Note that this is different from tcp_time_stamp, which
 * historically has been the same until linux-4.13.
 */
#define tcp_jiffies32 ((u32)jiffies)
/*
 * Deliver a 32bit value for TCP timestamp option (RFC 7323)
 * It is no longer tied to jiffies, but to 1 ms clock.
 * Note: double check if you want to use tcp_jiffies32 instead of this.
 */
#define TCP_TS_HZ 1000
/* TCP Timestamp included in TS option (RFC 1323) can either use ms * or usec resolution. Each socket carries a flag to select one or other * resolution, as the route attribute could change anytime. * Each flow must stick to initial resolution.
*/ staticinline u32 tcp_clock_ts(bool usec_ts)
{ return usec_ts ? tcp_clock_us() : tcp_clock_ms();
}
/* provide the departure time in us unit */ staticinline u64 tcp_skb_timestamp_us(conststruct sk_buff *skb)
{ return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}
/* Provide skb TSval in usec or ms unit.
 * NOTE(review): truncated by extraction — only the usec branch is present;
 * the ms branch and the closing brace are missing. Restore from the
 * original header.
 */
staticinline u32 tcp_skb_timestamp_ts(bool usec_ts, conststruct sk_buff *skb)
{ if (usec_ts) return tcp_skb_timestamp_us(skb);
/* State flags for sacked in struct tcp_skb_cb */
enum tcp_skb_cb_sacked_flags {
TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */
TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */
TCPCB_LOST = (1 << 2), /* SKB is lost */
TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS |
TCPCB_LOST), /* All tag bits */
TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */
/* bits 3, 5 and 6 are not assigned in this enum */
TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */
/* any kind of retransmit mark: */
TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS |
TCPCB_REPAIRED),
};
/* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
*/ struct tcp_skb_cb {
__u32 seq; /* Starting sequence number */
__u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss()
*/ struct {
u16 tcp_gso_segs;
u16 tcp_gso_size;
};
};
__u16 tcp_flags; /* TCP header flags (tcp[12-13])*/
__u8 sacked; /* State flags for SACK. */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ #define TSTAMP_ACK_SK 0x1 #define TSTAMP_ACK_BPF 0x2
__u8 txstamp_ack:2, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
unused:4;
__u32 ack_seq; /* Sequence number ACK'd */ union { struct { #define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */
__u32 is_app_limited:1, /* cwnd not fully used? */
delivered_ce:20,
unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered; /* start of send pipeline phase */
u64 first_tx_mstamp; /* when we reached the "delivered" count */
u64 delivered_mstamp;
} tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif
} header; /* For incoming skbs */
};
};
#if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP,
 * as TCP moves IP6CB into a different location in skb->cb[].
 * Fix: fused tokens ("staticinlineint", "conststruct") made this invalid C.
 * NOTE(review): the matching #endif for CONFIG_IPV6 is not visible in this
 * extract — verify it is present further down in the full header.
 */
static inline int tcp_v6_iif(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->header.h6.iif;
}
/* TCP_SKB_CB reference means this can not be used from early demux */ staticinlineint tcp_v6_sdif(conststruct sk_buff *skb)
{ #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) return TCP_SKB_CB(skb)->header.h6.iif; #endif return 0;
}
/* TCP_SKB_CB reference means this can not be used from early demux */ staticinlineint tcp_v4_sdif(struct sk_buff *skb)
{ #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return TCP_SKB_CB(skb)->header.h4.iif; #endif return 0;
}
/* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this.
*/ staticinlineint tcp_skb_pcount(conststruct sk_buff *skb)
{ return TCP_SKB_CB(skb)->tcp_gso_segs;
}
/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */ staticinlineint tcp_skb_mss(conststruct sk_buff *skb)
{ return TCP_SKB_CB(skb)->tcp_gso_size;
}
/* Events passed to congestion control interface */
enum tcp_ca_event {
CA_EVENT_TX_START, /* first transmit when no packets in flight */
CA_EVENT_CWND_RESTART, /* congestion window restart */
CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
CA_EVENT_LOSS, /* loss timeout */
CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
};
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
enum tcp_ca_ack_event_flags {
CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
};
/*
 * Interface for adding new TCP congestion control handlers.
 * Fix: multiple #define directives had been merged onto single physical
 * lines, which is invalid preprocessing.
 */
#define TCP_CA_NAME_MAX	16
#define TCP_CA_MAX	128
#define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CA_UNSPEC	0

/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED	BIT(0)
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN	BIT(1)
#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered "delivered" over an interval of time "interval_us".
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally chose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
	u64  prior_mstamp;	/* starting timestamp for interval */
	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	u32  prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
	s32  delivered;		/* number of packets delivered over interval */
	s32  delivered_ce;	/* number of packets delivered w/ CE marks*/
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	u32 snd_interval_us;	/* snd interval for delivered packets */
	u32 rcv_interval_us;	/* rcv interval for delivered packets */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	u32  prior_in_flight;	/* in flight before this ACK */
	u32  last_end_seq;	/* end_seq of most recently ACKed packet */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
};
/* Interface for pluggable TCP congestion control handlers.
 * NOTE(review): this struct appears truncated by extraction — the closing
 * "};" never occurs (the next visible line jumps straight into
 * tcp_is_sack()), and further members were likely dropped. Restore the
 * full definition from the original header.
 */
struct tcp_congestion_ops { /* fast path fields are put first to fill one cache line */
/* call when packets are delivered to update cwnd and pacing rate,
 * after all the ca_state processing. (optional)
 */
void (*cong_control)(struct sock *sk, u32 ack, int flag, conststruct rate_sample *rs);
/* new value of cwnd after loss (required) */
u32 (*undo_cwnd)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */
u32 (*sndbuf_expand)(struct sock *sk);
/* control/slow paths put last */ /* get info for inet_diag (optional) */
size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info);
/* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK
*/ staticinlineint tcp_is_sack(conststruct tcp_sock *tp)
{ return likely(tp->rx_opt.sack_ok);
}
/* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK * blocks etc.) we can make more aggressive calculations. * * Use this for decisions involving congestion control, use just * tp->packets_out to determine if the send queue is empty or not. * * Read this equation as: * * "Packets sent once on transmission queue" MINUS * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted"
*/ staticinlineunsignedint tcp_packets_in_flight(conststruct tcp_sock *tp)
{ return tp->packets_out - tcp_left_out(tp) + tp->retrans_out;
}
/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
 * The exception is cwnd reduction phase, when cwnd is decreasing towards
 * ssthresh.
 *
 * NOTE(review): truncated by extraction — the body stops after fetching
 * @tp; the actual computation and the closing brace are missing. Restore
 * from the original header.
 */
staticinline __u32 tcp_current_ssthresh(conststruct sock *sk)
{ conststruct tcp_sock *tp = tcp_sk(sk);
/* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)
/* The maximum number of MSS of available cwnd for which TSO defers * sending if not using sysctl_tcp_tso_win_divisor.
*/ staticinline __u32 tcp_max_tso_deferred_mss(conststruct tcp_sock *tp)
{ return 3;
}
/* Returns end sequence number of the receiver's advertised window */ staticinline u32 tcp_wnd_end(conststruct tcp_sock *tp)
{ return tp->snd_una + tp->snd_wnd;
}
/* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless * it was fully used previously. And that's exactly what we do in * congestion avoidance mode. But in slow start we allow cwnd to grow * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. * This check is safe because it's as aggressive as slow start which already * risks 100% overshoot. The advantage is that we discourage application to * either send more filler packets or data to artificially blow up the cwnd * usage, and allow application-limited process to probe bw more aggressively.
*/ staticinlinebool tcp_is_cwnd_limited(conststruct sock *sk)
{ conststruct tcp_sock *tp = tcp_sk(sk);
if (tp->is_cwnd_limited) returntrue;
/* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out;
returnfalse;
}
/* BBR congestion control needs pacing. * Same remark for SO_MAX_PACING_RATE. * sch_fq packet scheduler is efficiently handling pacing, * but is not always installed/used. * Return true if TCP stack should pace packets itself.
*/ staticinlinebool tcp_needs_internal_pacing(conststruct sock *sk)
{ return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
}
/* Estimates in how many jiffies next packet for this flow can be sent.
 * Scheduling a retransmit timer too early would be silly.
 *
 * NOTE(review): truncated by extraction — the body stops after computing
 * @delay; the conversion to jiffies / return statement and the closing
 * brace are missing. Restore from the original header.
 */
staticinlineunsignedlong tcp_pacing_delay(conststruct sock *sk)
{
s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;
/* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window, or we are paced. * We do not want to add fuel to the fire, or abort too early, * so make sure the timer we arm now is at least 200ms in the future, * regardless of current icsk_rto value (as it could be ~2ms)
*/ staticinlineunsignedlong tcp_probe0_base(conststruct sock *sk)
{ return max_t(unsignedlong, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
}
/* Variant of inet_csk_rto_backoff() used for zero window probes.
 * NOTE(review): truncated by extraction — the body stops after computing
 * @when; the clamping against @max_when / return statement and the closing
 * brace are missing. Restore from the original header.
 */
staticinlineunsignedlong tcp_probe0_when(conststruct sock *sk, unsignedlong max_when)
{
u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1,
inet_csk(sk)->icsk_backoff);
u64 when = (u64)tcp_probe0_base(sk) << backoff;
/* Assume a 50% default for skb->len/skb->truesize ratio.
 * This may be adjusted later in tcp_measure_rcv_mss().
 * Fix: the #define had been fused onto the comment-closing line; a
 * preprocessing directive must begin its own line.
 */
#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1))

void tcp_cleanup_rbuf(struct sock *sk, int copied);
void __tcp_cleanup_rbuf(struct sock *sk, int copied);
/* We provision sk_rcvbuf around 200% of sk_rcvlowat.
 * If 87.5 % (7/8) of the space has been consumed, we want to override
 * SO_RCVLOWAT constraint, since we are receiving skbs with too small
 * len/truesize ratio.
 *
 * NOTE(review): truncated by extraction — the body stops after the local
 * declarations; the computation, return and closing brace are missing.
 * Restore from the original header.
 */
staticinlinebool tcp_rmem_pressure(conststruct sock *sk)
{ int rcvbuf, threshold;
staticinlinebool tcp_paws_check(conststruct tcp_options_received *rx_opt, int paws_win)
{ if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) returntrue; if (unlikely(!time_before32(ktime_get_seconds(),
rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) returntrue; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, * then following tcp messages have valid values. Ignore 0 value, * or else 'negative' tsval might forbid us to accept their packets.
*/ if (!rx_opt->ts_recent) returntrue; returnfalse;
}
staticinlinebool tcp_paws_reject(conststruct tcp_options_received *rx_opt, int rst)
{ if (tcp_paws_check(rx_opt, 0)) returnfalse;
/* RST segments are not recommended to carry timestamp, and, if they do, it is recommended to ignore PAWS because "their cleanup function should take precedence over timestamps." Certainly, it is mistake. It is necessary to understand the reasons of this constraint to relax it: if peer reboots, clock may go out-of-sync and half-open connections will not be reset. Actually, the problem would be not existing if all the implementations followed draft about maintaining clock via reboots. Linux-2.2 DOES NOT!
However, we can relax time bounds for RST segments to MSL.
*/ if (rst && !time_before32(ktime_get_seconds(),
rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) returnfalse; returntrue;
}
bool tcp_oow_rate_limited(struct net *net, conststruct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time);
/**
 * struct tcp_sigpool - per-CPU pool of ahash_requests
 * @scratch: per-CPU temporary area, that can be used between
 *	     tcp_sigpool_start() and tcp_sigpool_end() to perform
 *	     crypto request
 * @req: pre-allocated ahash request
 */
struct tcp_sigpool {
	void *scratch;
	struct ahash_request *req;
};
int tcp_sigpool_alloc_ahash(constchar *alg, size_t scratch_size); void tcp_sigpool_get(unsignedint id); void tcp_sigpool_release(unsignedint id); int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, conststruct sk_buff *skb, unsignedint header_len);
/** * tcp_sigpool_start - disable bh and start using tcp_sigpool_ahash * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * * Returns: 0 on success, error otherwise.
*/ int tcp_sigpool_start(unsignedint id, struct tcp_sigpool *c); /** * tcp_sigpool_end - enable bh and stop using tcp_sigpool * @c: tcp_sigpool context that was returned by tcp_sigpool_start()
*/ void tcp_sigpool_end(struct tcp_sigpool *c);
size_t tcp_sigpool_algo(unsignedint id, char *buf, size_t buf_len); /* - functions */ int tcp_v4_md5_hash_skb(char *md5_hash, conststruct tcp_md5sig_key *key, conststruct sock *sk, conststruct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, constunion tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); int tcp_md5_key_copy(struct sock *sk, constunion tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, struct tcp_md5sig_key *key);
int tcp_md5_do_del(struct sock *sk, constunion tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); void tcp_clear_md5_list(struct sock *sk); struct tcp_md5sig_key *tcp_v4_md5_lookup(conststruct sock *sk, conststruct sock *addr_sk);
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index);
/* Fast-path MD5 key lookup: bail out cheaply via the static branch when
 * no MD5 keys are in use system-wide.
 * Fix: fused tokens ("staticinlinestruct", "conststruct", "constunion")
 * made this invalid C.
 * NOTE(review): the matching #endif for CONFIG_TCP_MD5SIG is not visible
 * in this extract — verify it is present further down in the full header.
 */
static inline struct tcp_md5sig_key *
tcp_md5_do_lookup(const struct sock *sk, int l3index,
		  const union tcp_md5_addr *addr, int family)
{
	if (!static_branch_unlikely(&tcp_md5_needed.key))
		return NULL;
	return __tcp_md5_do_lookup(sk, l3index, addr, family, false);
}
staticinline int tcp_fastopen_context_len(conststruct tcp_fastopen_context *ctx)
{ return ctx->num;
}
/* Latencies incurred by various limits for a sender. They are
 * chronograph-like stats that are mutually exclusive.
 */
enum tcp_chrono {
TCP_CHRONO_UNSPEC,
TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
__TCP_CHRONO_MAX,
};
/* This helper is needed, because skb->tcp_tsorted_anchor uses
 * the same memory storage than skb->destructor/_skb_refdst
 * Fix: "staticinlinevoid" was fused into one invalid token.
 */
static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
{
	skb->destructor = NULL;
	skb->_skb_refdst = 0UL;
}
/*
 * NOTE(review): the following text is a website-extraction artifact (a
 * German legal disclaimer), not part of this header. It is kept here,
 * translated to English and commented out so the file remains valid C:
 *
 * "The information on this website has been compiled carefully and to the
 *  best of our knowledge. However, neither completeness nor correctness
 *  nor quality of the information provided is guaranteed.
 *  Remark: the coloured syntax rendering and the measurement are still
 *  experimental."
 */