struct tcp_fastopen_metrics {
	u16	mss;
	u16	syn_loss:10,		/* Recurring Fast Open SYN losses */
		try_exp:2;		/* Request w/ exp. option (once) */
	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
	struct	tcp_fastopen_cookie	cookie;
};
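/* For orientation (not part of the excerpt above): the cache entries that the
 * functions below operate on look roughly like this. The field names match the
 * accessors used later (tcpm_next, tcpm_net, tcpm_stamp, tcpm_lock, tcpm_vals);
 * the exact layout is a sketch, not an authoritative quote of this source.
 */
struct tcp_metrics_block {
	struct tcp_metrics_block __rcu	*tcpm_next;
	struct net			*tcpm_net;
	struct inetpeer_addr		tcpm_saddr;
	struct inetpeer_addr		tcpm_daddr;
	unsigned long			tcpm_stamp;
	u32				tcpm_lock;
	u32				tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
	struct tcp_fastopen_metrics	tcpm_fastopen;

	struct rcu_head			rcu_head;
};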
/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
 * Kernel only stores RTT and RTTVAR in usec resolution
 */
#define TCP_METRIC_MAX_KERNEL	(TCP_METRIC_MAX - 2)
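/* Assumed metric index layout (as in include/uapi/linux/tcp_metrics.h in
 * mainline; reproduced here as an assumption for context): the two usec-
 * resolution entries sit at the end, which is why the kernel-side array can
 * stop two slots short of TCP_METRIC_MAX.
 */
enum tcp_metric_index {
	TCP_METRIC_RTT,		/* in ms units */
	TCP_METRIC_RTTVAR,	/* in ms units */
	TCP_METRIC_SSTHRESH,
	TCP_METRIC_CWND,
	TCP_METRIC_REORDERING,

	TCP_METRIC_RTT_US,	/* in usec units */
	TCP_METRIC_RTTVAR_US,	/* in usec units */

	/* Always last. */
	__TCP_METRIC_MAX,
};
#define TCP_METRIC_MAX	(__TCP_METRIC_MAX - 1)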
	val = 0;
	if (dst_metric_locked(dst, RTAX_RTT))
		val |= 1 << TCP_METRIC_RTT;
	if (dst_metric_locked(dst, RTAX_RTTVAR))
		val |= 1 << TCP_METRIC_RTTVAR;
	if (dst_metric_locked(dst, RTAX_SSTHRESH))
		val |= 1 << TCP_METRIC_SSTHRESH;
	if (dst_metric_locked(dst, RTAX_CWND))
		val |= 1 << TCP_METRIC_CWND;
	if (dst_metric_locked(dst, RTAX_REORDERING))
		val |= 1 << TCP_METRIC_REORDERING;
	/* Paired with READ_ONCE() in tcp_metric_locked() */
	WRITE_ONCE(tm->tcpm_lock, val);
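/* The reader side that the comment above refers to, sketched for context
 * (assumed to match the helper elsewhere in tcp_metrics.c):
 */
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
			      enum tcp_metric_index idx)
{
	/* Paired with WRITE_ONCE() in tcpm_suck_dst() */
	return READ_ONCE(tm->tcpm_lock) & (1 << idx);
}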
	spin_lock_bh(&tcp_metrics_lock);
	net = dst_dev_net_rcu(dst);

	/* While waiting for the spin-lock the cache might have been populated
	 * with this entry and so we have to check again.
	 */
	tm = __tcp_get_metrics(saddr, daddr, net, hash);
	if (tm == TCP_METRICS_RECLAIM_PTR) {
		reclaim = true;
		tm = NULL;
	}
	if (tm) {
		tcpm_check_stamp(tm, dst);
		goto out_unlock;
	}

	if (unlikely(reclaim)) {
		struct tcp_metrics_block *oldest;

		oldest = deref_locked(tcp_metrics_hash[hash].chain);
		for (tm = deref_locked(oldest->tcpm_next); tm;
		     tm = deref_locked(tm->tcpm_next)) {
			if (time_before(READ_ONCE(tm->tcpm_stamp),
					READ_ONCE(oldest->tcpm_stamp)))
				oldest = tm;
		}
		tm = oldest;
	} else {
		tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
		if (!tm)
			goto out_unlock;
	}
	/* Paired with the READ_ONCE() in tm_net() */
	WRITE_ONCE(tm->tcpm_net, net);

	tm->tcpm_saddr = *saddr;
	tm->tcpm_daddr = *daddr;

	tcpm_suck_dst(tm, dst, reclaim);

	if (likely(!reclaim)) {
		tm->tcpm_next = tcp_metrics_hash[hash].chain;
		rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
	}
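/* How the reclaim path above gets triggered, sketched as an assumption about
 * the surrounding code: __tcp_get_metrics() returns a sentinel instead of a
 * match once a hash chain has grown past a small fixed depth, so tcpm_new()
 * evicts the stalest entry (oldest tcpm_stamp) instead of allocating a new
 * one. The values below mirror mainline but are reproduced from memory.
 */
#define TCP_METRICS_RECLAIM_DEPTH	5
#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL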
/* Save metrics learned by this TCP session. This function is called
 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
 * or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct tcp_metrics_block *tm;
	unsigned long rtt;
	u32 val;
	int m;

	sk_dst_confirm(sk);
	if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
		return;

	rcu_read_lock();
	if (icsk->icsk_backoff || !tp->srtt_us) {
		/* This session failed to estimate rtt. Why?
		 * Probably, no packets returned in time. Reset our
		 * results.
		 */
		tm = tcp_get_metrics(sk, dst, false);
		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
			tcp_metric_set(tm, TCP_METRIC_RTT, 0);
		goto out_unlock;
	} else
		tm = tcp_get_metrics(sk, dst, true);

	if (!tm)
		goto out_unlock;

	rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
	m = rtt - tp->srtt_us;
	/* If the newly calculated rtt is larger than the stored one,
	 * store the new one. Otherwise, use EWMA. Remember, rtt
	 * overestimation is always better than underestimation.
	 */
	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
		if (m <= 0)
			rtt = tp->srtt_us;
		else
			rtt -= (m >> 3);
		tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
	}
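/* Illustrative only (hypothetical helper, not part of tcp_metrics.c): the
 * same blending as above in isolation. With a cached value of 100000 usec and
 * a session srtt of 80000 usec, m = 20000 and the cache moves to
 * 100000 - (20000 >> 3) = 97500 usec, i.e. a 7/8-old, 1/8-new average that
 * only decays slowly towards smaller RTTs.
 */
static unsigned long tcp_metrics_blend_rtt(unsigned long cached, u32 srtt_us)
{
	long m = (long)cached - (long)srtt_us;

	if (m <= 0)			/* new sample is larger: adopt it as-is */
		return srtt_us;
	return cached - (m >> 3);	/* otherwise move 1/8 of the way down */
}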
	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
		unsigned long var;

		if (m < 0)
			m = -m;

		/* Scale deviation to rttvar fixed point */
		m >>= 1;
		if (m < tp->mdev_us)
			m = tp->mdev_us;

		var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
		if (m >= var)
			var = m;
		else
			var -= (var - m) >> 2;

		tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
	}
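/* Worked arithmetic for the block above (numbers are illustrative only):
 * with a deviation of m = 20000 usec and tp->mdev_us = 15000, the scaled
 * deviation is max(20000 >> 1, 15000) = 15000; a cached RTTVAR of 40000 then
 * decays to 40000 - ((40000 - 15000) >> 2) = 33750 usec, i.e. a 3/4-old,
 * 1/4-new blend, while any larger deviation replaces the cache outright.
 */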
	if (tcp_in_initial_slowstart(tp)) {
		/* Slow start still did not finish. */
		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && (tcp_snd_cwnd(tp) >> 1) > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tcp_snd_cwnd(tp) >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			if (tcp_snd_cwnd(tp) > val)
				tcp_metric_set(tm, TCP_METRIC_CWND,
					       tcp_snd_cwnd(tp));
		}
	} else if (!tcp_in_slow_start(tp) &&
		   icsk->icsk_ca_state == TCP_CA_Open) {
		/* Cong. avoidance phase, cwnd is reliable. */
		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
				       max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1);
		}
	} else {
		/* Else slow start did not finish, cwnd is nonsense,
		 * and ssthresh may also be invalid.
		 */
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND,
				       (val + tp->snd_ssthresh) >> 1);
		}
		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && tp->snd_ssthresh > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_ssthresh);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
			if (val < tp->reordering &&
			    tp->reordering !=
			    READ_ONCE(net->ipv4.sysctl_tcp_reordering))
				tcp_metric_set(tm, TCP_METRIC_REORDERING,
					       tp->reordering);
		}
	}
	WRITE_ONCE(tm->tcpm_stamp, jiffies);
out_unlock:
	rcu_read_unlock();
}
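/* The accessors used throughout the function above, sketched for context
 * (simplified; assumed to match the helpers elsewhere in tcp_metrics.c):
 */
static u32 tcp_metric_get(struct tcp_metrics_block *tm,
			  enum tcp_metric_index idx)
{
	/* Paired with WRITE_ONCE() in tcp_metric_set() */
	return READ_ONCE(tm->tcpm_vals[idx]);
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
			   enum tcp_metric_index idx,
			   u32 val)
{
	/* Paired with READ_ONCE() in tcp_metric_get() */
	WRITE_ONCE(tm->tcpm_vals[idx], val);
}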
	sk_dst_confirm(sk);
	/* ssthresh may have been reduced unnecessarily during the
	 * 3WHS. Restore it back to its initial default.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	if (!dst)
		goto reset;
	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

	val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
	      0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
	if (val) {
		tp->snd_ssthresh = val;
		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
			tp->snd_ssthresh = tp->snd_cwnd_clamp;
	}
	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
	if (val && tp->reordering != val)
		tp->reordering = val;

	crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
	rcu_read_unlock();
reset:
	/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
	 * to seed the RTO for later data packets because SYN packets are
	 * small. Use the per-dst cached values to seed the RTO but keep
	 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
	 * Later the RTO will be updated immediately upon obtaining the first
	 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
	 * influences the first RTO but not later RTT estimation.
	 *
	 * But if RTT is not available from the SYN (due to retransmits or
	 * syn cookies) or the cache, force a conservative 3secs timeout.
	 *
	 * A bit of theory. RTT is the time that passes after a "normal"
	 * sized packet is sent until it is ACKed. In normal circumstances
	 * sending small packets forces the peer to delay ACKs, so the
	 * calculation stays correct. The algorithm is adaptive and,
	 * provided we follow the specs, it NEVER underestimates RTT. BUT!
	 * If the peer plays clever tricks, sort of "quick acks", for long
	 * enough to drive the RTT down to a low value and then abruptly
	 * stops and starts delaying ACKs, expect trouble.
	 */
	if (crtt > tp->srtt_us) {
		/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
		crtt /= 8 * USEC_PER_SEC / HZ;
		inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
	} else if (tp->srtt_us == 0) {
		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
		 * the 3WHS. This is most likely due to retransmission,
		 * including a spurious one. Reset the RTO back to 3secs
		 * from the more aggressive 1sec to avoid more spurious
		 * retransmission.
		 */
		tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
		tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
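/* Worked example for the RTO seeding above (illustrative, assumes HZ == 1000
 * and the default tcp_rto_min() of 200 ms): a cached RTT metric of 800000
 * (100 ms in usec, scaled by 8 like tp->srtt_us) becomes
 * 800000 / (8 * USEC_PER_SEC / HZ) = 100 jiffies, and the seeded RTO is
 * 100 + max(2 * 100, 200) = 300 jiffies, i.e. 300 ms, until the first real
 * data RTT sample retunes it via tcp_rtt_estimator().
 */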
	nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
	if (!nest)
		goto nla_put_failure;
	for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
		u32 val = tcp_metric_get(tm, i);

		if (!val)
			continue;
		if (i == TCP_METRIC_RTT) {
			if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
					val) < 0)
				goto nla_put_failure;
			n++;
			val = max(val / 1000, 1U);
		}
		if (i == TCP_METRIC_RTTVAR) {
			if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
					val) < 0)
				goto nla_put_failure;
			n++;
			val = max(val / 1000, 1U);
		}
		if (nla_put_u32(msg, i + 1, val) < 0)
			goto nla_put_failure;
		n++;
	}
	if (n)
		nla_nest_end(msg, nest);
	else
		nla_nest_cancel(msg, nest);
	}
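/* Design note (an inference from the loop above, not a statement from this
 * source): RTT and RTTVAR are stored in usec internally, so they are emitted
 * twice - once unchanged under the *_US attributes and once divided by 1000
 * (clamped to at least 1) under the original ms-resolution attribute numbers,
 * which keeps older userspace dumpers such as "ip tcp_metrics" working.
 */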
static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
			   unsigned int *hash, int optional, int v4, int v6)
{
	struct nlattr *a;

	a = info->attrs[v4];
	if (a) {
		inetpeer_set_addr_v4(addr, nla_get_in_addr(a));
		if (hash)
			*hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
		return 0;
	}
	a = info->attrs[v6];
	if (a) {
		struct in6_addr in6;
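/* The excerpt breaks off here. For context, the IPv6 branch presumably
 * continues along these lines (a sketch of the mainline code from memory,
 * not a verbatim quote of this source):
 */
		if (nla_len(a) != sizeof(struct in6_addr))
			return -EINVAL;
		in6 = nla_get_in6_addr(a);
		inetpeer_set_addr_v6(addr, &in6);
		if (hash)
			*hash = ipv6_addr_hash(&in6);
		return 0;
	}
	return optional ? 1 : -EAFNOSUPPORT;
}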