/* WARNING: This implementation is not necessarily the same
 * as the tcp_cubic.c.  The purpose is mainly for testing
 * the kernel BPF logic.
 *
 * Highlights:
 * 1. CONFIG_HZ .kconfig map is used.
 * 2. In bictcp_update(), calculation is changed to use usec
 *    resolution (i.e. USEC_PER_JIFFY) instead of using jiffies.
 *    Thus, usecs_to_jiffies() is not used in the bpf_cubic.c.
 * 3. In bictcp_update() [under tcp_friendliness], the original
 *    "while (ca->ack_cnt > delta)" loop is changed to the equivalent
 *    "ca->ack_cnt / delta" operation.
 */
staticconst __u32 cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ staticconst __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
/ (BICTCP_BETA_SCALE - beta); /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 * so K = cubic_root( (wmax-cwnd)*rtt/c ) * the unit of K is bictcp_HZ=2^10, not HZ * * c = bic_scale >> 10 * rtt = 100ms * * the following code has been designed and tested for * cwnd < 1 million packets * RTT < 100 seconds * HZ < 1,000,00 (corresponding to 10 nano-second)
*/
/* BIC TCP Parameters */
/* Per-connection CUBIC congestion-control state.
 * NOTE(review): the field set appears to mirror the kernel's
 * "struct bictcp" in net/ipv4/tcp_cubic.c — confirm against upstream
 * before reordering or resizing any field.
 * epoch_start/last_time hold jiffies values (see bictcp_update below);
 * delay_min and curr_rtt are in usec.
 */
struct bpf_bictcp {
__u32 cnt; /* increase cwnd by 1 after ACKs */
__u32 last_max_cwnd; /* last maximum snd_cwnd */
__u32 last_cwnd; /* the last snd_cwnd */
__u32 last_time; /* time when updated last_cwnd */
__u32 bic_origin_point;/* origin point of bic function */
__u32 bic_K; /* time to origin point
from the beginning of the current epoch */
__u32 delay_min; /* min delay (usec) */
__u32 epoch_start; /* beginning of an epoch */
__u32 ack_cnt; /* number of acks */
__u32 tcp_cwnd; /* estimated tcp cwnd */
__u16 unused; /* NOTE(review): likely layout padding; not read in this excerpt */
__u8 sample_cnt; /* number of samples to decide curr_rtt */
__u8 found; /* the exit point is found? */
__u32 round_start; /* beginning of each round */
__u32 end_seq; /* end_seq of the round */
__u32 last_ack; /* last time when the ACK spacing is close */
__u32 curr_rtt; /* the minimum rtt of current round */
};
#define BITS_PER_U64 (sizeof(__u64) * 8)

/* Find last (most significant) set bit in a 64-bit word.
 * Returns the 1-based bit position (1..64), or 0 when x == 0,
 * matching the kernel's fls64() contract.
 * Implemented as a fixed branch cascade (no loops) so the BPF
 * verifier can bound it.
 *
 * NOTE(review): the scraped original fused "static int" into "staticint",
 * dropped the trailing "return num + 1;" and had an unrelated
 * epoch_start-resync fragment (belonging to the cwnd-event handler)
 * spliced into the tail; both are repaired here.
 */
static int fls64(__u64 x)
{
	int num = BITS_PER_U64 - 1;

	if (x == 0)
		return 0;

	if (!(x & (~0ull << (BITS_PER_U64-32)))) {
		num -= 32;
		x <<= 32;
	}
	if (!(x & (~0ull << (BITS_PER_U64-16)))) {
		num -= 16;
		x <<= 16;
	}
	if (!(x & (~0ull << (BITS_PER_U64-8)))) {
		num -= 8;
		x <<= 8;
	}
	if (!(x & (~0ull << (BITS_PER_U64-4)))) {
		num -= 4;
		x <<= 4;
	}
	if (!(x & (~0ull << (BITS_PER_U64-2)))) {
		num -= 2;
		x <<= 2;
	}
	if (!(x & (~0ull << (BITS_PER_U64-1))))
		num -= 1;

	return num + 1;
}
/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 *
 * NOTE(review): 'v' is a small lookup table and div64_u64() a helper,
 * both defined elsewhere in this file.  Statement order here matters
 * to the BPF verifier — do not restructure.
 */
static __u32 cubic_root(__u64 a)
{
	__u32 x, b, shift;

	/* table handles small inputs directly */
	if (a < 64) { /* a in [0..63] */
		return ((__u32)v[(__u32)a] + 35) >> 6;
	}
	b = fls64(a);
	/* b*84 >> 8 approximates b/3: exponent of the initial estimate */
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* it is needed for verifier's bound check on v */
	if (shift >= 64)
		return 0;

	x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
	/* *341 >> 10 approximates division by 3 */
	x = ((x * 341) >> 10);
	return x;
}
/* NOTE(review): what follows is an INCOMPLETE excerpt from the interior of
 * bictcp_update().  The function signature, local declarations (t, offs,
 * delta, max_cnt, bic_target, cwnd, acked), the bic_target computation and
 * the tcp_friendliness label were lost in extraction, so the braces below
 * do not balance on their own.  Restore the missing statements from the
 * original bpf_cubic.c before compiling; code is left byte-identical here.
 */
/* The CUBIC function can update ca->cnt at most once per jiffy. * On all cwnd reduction events, ca->epoch_start is set to 0, * which will force a recalculation of ca->cnt.
 */ if (ca->epoch_start && tcp_jiffies32 == ca->last_time) goto tcp_friendliness;
/* start of a new epoch: record the time origin and (re)compute K */
if (ca->epoch_start == 0) {
ca->epoch_start = tcp_jiffies32; /* record beginning */
ca->ack_cnt = acked; /* start counting */
ca->tcp_cwnd = cwnd; /* syn with cubic */
if (ca->last_max_cwnd <= cwnd) {
ca->bic_K = 0;
ca->bic_origin_point = cwnd;
} else { /* Compute new K based on * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
 */
ca->bic_K = cubic_root(cube_factor
* (ca->last_max_cwnd - cwnd));
ca->bic_origin_point = ca->last_max_cwnd;
}
}
/* cubic function - calc*/ /* calculate c * time^3 / rtt, * while considering overflow in calculation of time^3 * (so time^3 is done by using 64 bit) * and without the support of division of 64bit numbers * (so all divisions are done by using 32 bit) * also NOTE the unit of those variables * time = (t - K) / 2^bictcp_HZ * c = bic_scale >> 10 * rtt = (srtt >> 3) / HZ * !!! The following code does not have overflow problems, * if the cwnd < 1 million packets !!!
 */
/* elapsed time since epoch start + min delay, converted to bictcp_HZ units */
t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
t += ca->delay_min; /* change the unit from usec to bictcp_HZ */
t <<= BICTCP_HZ;
t /= USEC_PER_SEC;
/* |t - K|, the distance from the curve's origin point */
if (t < ca->bic_K) /* t - K */
offs = ca->bic_K - t; else
offs = t - ca->bic_K;
/* NOTE(review): the bic_target computation from offs is missing here */
/* cubic function - calc bictcp_cnt*/ if (bic_target > cwnd) {
ca->cnt = cwnd / (bic_target - cwnd);
} else {
ca->cnt = 100 * cwnd; /* very small increment*/
}
/* * The initial growth of cubic function may be too conservative * when the available bandwidth is still unknown.
 */ if (ca->last_max_cwnd == 0 && ca->cnt > 20)
ca->cnt = 20; /* increase cwnd 5% per RTT */
/* TCP-friendliness: never grow slower than Reno's estimate tcp_cwnd */
if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
delta = ca->tcp_cwnd - cwnd;
max_cnt = cwnd / delta; if (ca->cnt > max_cnt)
ca->cnt = max_cnt;
}
}
/* The maximum rate of cwnd increase CUBIC allows is 1 packet per * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
 */
ca->cnt = max(ca->cnt, 2U);
}
/* NOTE(review): this region fuses fragments of THREE different functions
 * lost to extraction: the comment and signature of hystart_ack_delay(),
 * a line from hystart_update() ("threshold >>= 1"), and the body of the
 * .cong_avoid/pkts_acked callback (bpf_cubic_acked).  "unsignedlong" is a
 * fused "unsigned long".  Restore each function from the original
 * bpf_cubic.c before compiling; code is left byte-identical here.
 */
/* Account for TSO/GRO delays. * Otherwise short RTT flows could get too small ssthresh, since during * slow start we begin with small TSO packets and ca->delay_min would * not account for long aggregation delay when TSO packets get bigger. * Ideally even with a very small RTT we would like to have at least one * TSO packet being sent and received by GRO, and another one in qdisc layer. * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms.
 */ static __u32 hystart_ack_delay(struct sock *sk)
{ unsignedlong rate;
/* Hystart ack train triggers if we get ack past * ca->delay_min/2. * Pacing might have delayed packets up to RTT/2 * during slow start.
 */ if (sk->sk_pacing_status == SK_PACING_NONE)
threshold >>= 1;
/* --- fragment from the acked callback begins here --- */
bpf_cubic_acked_called = 1; /* Some calls are for duplicates without timestamps */ if (sample->rtt_us < 0) return;
/* Discard delay samples right after fast recovery */ if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return;
delay = sample->rtt_us; if (delay == 0)
delay = 1;
/* first time call or link delay decreases */ if (ca->delay_min == 0 || ca->delay_min > delay)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */ if (!ca->found && tcp_in_slow_start(tp) && hystart &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.