// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic :
 * Fast classification depends on skb->sk being set before reaching us.
 * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees
 * They are also part of one Round Robin 'queues' (new or old flows)
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If non existent flow, create it, add it to the tree.
 *     Add skb to the per flow list of skb (fifo).
 *   - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (its expected to send additional packets,
 * or SLAB cache will reuse socket for another flow)
 */
/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	union {
		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
		/* Following field is only used for q->internal,
		 * because q->internal is not hashed in fq_root[]
		 */
		u64		stat_fastpath_packets;
	};
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line */
	int		credit;
	int		band;
	struct fq_flow *next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
};
/* Per-band round-robin state : two RR lists (new then old flows) plus the
 * deficit counter used to arbitrate between the 3 bands.
 */
struct fq_perband_flows {
	struct fq_flow_head new_flows;
	struct fq_flow_head old_flows;
	int		    credit;
	int		    quantum; /* based on band nr : 576KB, 192KB, 64KB */
};
/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 */
static void fq_flow_set_detached(struct fq_flow *f)
{
	/* ORing with 1UL makes the stored jiffies value odd, so it can never
	 * be mistaken for a (word-aligned) sk_buff pointer in the union.
	 */
	f->age = jiffies | 1UL;
}
/* Fast path can be used if : * 1) Packet tstamp is in the past, or within the pacing offload horizon. * 2) FQ qlen == 0 OR * (no flow is currently eligible for transmit, * AND fast path queue has less than 8 packets) * 3) No SO_MAX_PACING_RATE on the socket (if any). * 4) No @maxrate attribute on this qdisc, * * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
*/ staticbool fq_fastpath_check(conststruct Qdisc *sch, struct sk_buff *skb,
u64 now)
{ conststruct fq_sched_data *q = qdisc_priv(sch); conststruct sock *sk;
if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon) returnfalse;
if (sch->q.qlen != 0) { /* Even if some packets are stored in this qdisc, * we can still enable fast path if all of them are * scheduled in the future (ie no flows are eligible) * or in the fast path queue.
*/ if (q->flows != q->inactive_flows + q->throttled_flows) returnfalse;
/* Do not allow fast path queue to explode, we want Fair Queue mode * under pressure.
*/ if (q->internal.qlen >= 8) returnfalse;
/* Ordering invariants fall apart if some delayed flows * are ready but we haven't serviced them, yet.
*/ if (q->time_next_delayed_flow <= now + q->offload_horizon) returnfalse;
}
sk = skb->sk; if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
sk->sk_max_pacing_rate != ~0UL) returnfalse;
/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket * or a listener (SYNCOOKIE mode) * 1) request sockets are not full blown, * they do not contain sk_pacing_rate * 2) They are not part of a 'flow' yet * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE * 4) We pretend they are orphaned * TCP can also associate TIME_WAIT sockets with RST or ACK packets.
*/ if (!sk || sk_listener_or_tw(sk)) { unsignedlong hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not * collide with a local flow (socket pointers are word aligned)
*/
sk = (struct sock *)((hash << 1) | 1UL);
skb_orphan(skb);
} elseif (sk->sk_state == TCP_CLOSE) { unsignedlong hash = skb_get_hash(skb) & q->orphan_mask; /* * Sockets in TCP_CLOSE are non connected. * Typical use case is UDP sockets, they can send packets * with sendto() to many different destinations. * We probably could use a generic bit advertising * non connected sockets, instead of sk_state == TCP_CLOSE, * if we care enough.
*/
sk = (struct sock *)((hash << 1) | 1UL);
}
if (fq_fastpath_check(sch, skb, now)) {
q->internal.stat_fastpath_packets++; if (skb->sk == sk && q->rate_enable &&
READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ)
smp_store_release(&sk->sk_pacing_status,
SK_PACING_FQ); return &q->internal;
}
p = &root->rb_node;
parent = NULL; while (*p) {
parent = *p;
f = rb_entry(parent, struct fq_flow, fq_node); if (f->sk == sk) { /* socket might have been reallocated, so check * if its sk_hash is the same. * It not, we need to refill credit with * initial quantum
*/ if (unlikely(skb->sk == sk &&
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash; if (q->rate_enable)
smp_store_release(&sk->sk_pacing_status,
SK_PACING_FQ); if (fq_flow_is_throttled(f))
fq_flow_unset_throttled(q, f);
f->time_next_packet = 0ULL;
} return f;
} if (f->sk > sk)
p = &parent->rb_right; else
p = &parent->rb_left;
}
f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!f)) {
q->stat_allocation_errors++; return &q->internal;
} /* f->t_root is already zeroed after kmem_cache_zalloc() */
fq_flow_set_detached(f);
f->sk = sk; if (skb->sk == sk) {
f->socket_hash = sk->sk_hash; if (q->rate_enable)
smp_store_release(&sk->sk_pacing_status,
SK_PACING_FQ);
}
f->credit = q->initial_quantum;
/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}
band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
q->stat_band_drops[band]++; return qdisc_drop_reason(skb, sch, to_free,
FQDR(BAND_LIMIT));
}
now = ktime_get_ns(); if (!skb->tstamp) {
fq_skb_cb(skb)->time_to_send = now;
} else { /* Check if packet timestamp is too far in the future. */ if (fq_packet_beyond_horizon(skb, q, now)) { if (q->horizon_drop) {
q->stat_horizon_drops++; return qdisc_drop_reason(skb, sch, to_free,
FQDR(HORIZON_LIMIT));
}
q->stat_horizon_caps++;
skb->tstamp = now + q->horizon;
}
fq_skb_cb(skb)->time_to_send = skb->tstamp;
}
f = fq_classify(sch, skb, now);
if (f != &q->internal) { if (unlikely(f->qlen >= q->flow_plimit)) {
q->stat_flows_plimit++; return qdisc_drop_reason(skb, sch, to_free,
FQDR(FLOW_LIMIT));
}
if (fq_flow_is_detached(f)) {
fq_flow_add_tail(q, f, NEW_FLOW); if (time_after(jiffies, f->age + q->flow_refill_delay))
f->credit = max_t(u32, f->credit, q->quantum);
}
/* NOTE(review): truncated fragment. This is the tail (pacing / rate
 * enforcement section plus the 'out:' epilogue) of fq_dequeue(); the
 * function header, flow selection and credit logic that precede it were
 * lost in extraction, as was the surrounding whitespace (several statements
 * are fused onto single lines). Left byte-identical; restore from upstream
 * net/sched/sch_fq.c before attempting to build.
 * Presumably in scope here: q, f, skb, rate, plen, now — TODO confirm.
 */
/* If EDT time was provided for this skb, we need to * update f->time_next_packet only if this qdisc enforces * a flow max rate.
*/ if (!skb->tstamp) { if (skb->sk)
rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
/* Below the threshold, pacing is ignored and credit zeroed. */
if (rate <= q->low_rate_threshold) {
f->credit = 0;
} else {
plen = max(plen, q->quantum); if (f->credit > 0) goto out;
}
/* rate == ~0UL means "no rate limit" : skip delay computation. */
} if (rate != ~0UL) {
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
len = div64_ul(len, rate); /* Since socket rate can change later, * clamp the delay to 1 second. * Really, providers of too big packets should be fixed !
*/ if (unlikely(len > NSEC_PER_SEC)) {
len = NSEC_PER_SEC;
q->stat_pkts_too_long++;
} /* Account for schedule/timers drifts. * f->time_next_packet was set when prior packet was sent, * and current time (@now) can be too late by tens of us.
*/ if (f->time_next_packet)
len -= min(len/2, now - f->time_next_packet);
f->time_next_packet = now + len;
}
out:
qdisc_bstats_update(sch, skb); return skb;
}
/* NOTE(review): truncated fragment — the opening of fq_resize() (hash table
 * reallocation). Its signature (presumably fq_resize(struct Qdisc *sch,
 * u32 log) — TODO confirm), the rehash of existing flows into the new array
 * and the RCU-safe swap/free tail were lost in extraction. Left
 * byte-identical; restore from upstream net/sched/sch_fq.c.
 */
/* Nothing to do when the tree array already exists at the requested size. */
if (q->fq_root && log == q->fq_trees_log) return 0;
/* If XPS was setup, we can allocate memory on right NUMA node */
array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM;
/* compress a u8 array with all elems <= 3 to an array of 2-bit fields
 * @in:  TC_PRIO_MAX + 1 bytes, each holding a band number (0..3)
 * @out: FQ_PRIO2BAND_CRUMB_SIZE bytes, 4 crumbs packed per byte
 *
 * Built in a local buffer first, then published byte-by-byte with
 * WRITE_ONCE() so lockless readers never observe a torn entry.
 */
static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE];
	int i;

	memset(tmp, 0, sizeof(tmp));
	for (i = 0; i < num_elems; i++)
		tmp[i / 4] |= in[i] << (2 * (i & 0x3));

	for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++)
		WRITE_ONCE(out[i], tmp[i]);
}
for (i = 0; i < FQ_BANDS; i++) { if (weights[i] < FQ_MIN_WEIGHT) {
NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less that minimum allowed %d",
weights[i], FQ_MIN_WEIGHT); return -EINVAL;
}
} for (i = 0; i < FQ_BANDS; i++)
WRITE_ONCE(q->band_flows[i].quantum, weights[i]); return 0;
}
if (map->bands != FQ_BANDS) {
NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands"); return -EINVAL;
} for (i = 0; i < TC_PRIO_MAX + 1; i++) { if (map->priomap[i] >= FQ_BANDS) {
NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
i, map->priomap[i]); return -EINVAL;
}
}
fq_prio2band_compress_crumb(map->priomap, q->prio2band); return 0;
}
/*
 * NOTE(review): the text below is website boilerplate (a German disclaimer)
 * that was accidentally appended to this source chunk; it is not part of
 * sch_fq.c and, left as bare prose, would break compilation. Preserved here,
 * translated to English:
 * "The information on this web page was compiled carefully and to the best
 *  of our knowledge. However, neither completeness, nor correctness, nor
 *  quality of the provided information is guaranteed.
 *  Remark: the colored syntax display and the measurement are still
 *  experimental."
 */