/* One entry of the Gate Control List (GCL), i.e. one row of the taprio
 * schedule.
 */
struct sched_entry {
	/* Durations between this GCL entry and the GCL entry where the
	 * respective traffic class gate closes
	 */
	u64 gate_duration[TC_MAX_QUEUE];
	/* Per-TC transmit budget, in bytes, remaining for the current
	 * interval (derived from gate_duration and the link speed)
	 */
	atomic_t budget[TC_MAX_QUEUE];
	/* The qdisc makes some effort so that no packet leaves
	 * after this time
	 */
	ktime_t gate_close_time[TC_MAX_QUEUE];
	/* Membership in sched_gate_list :: entries */
	struct list_head list;
	/* Used to calculate when to advance the schedule */
	ktime_t end_time;
	/* Next absolute time at which this entry's interval starts
	 * (txtime-assist mode)
	 */
	ktime_t next_txtime;
	/* Position of this entry within the schedule */
	int index;
	/* Bit mask of traffic classes whose gates are open */
	u32 gate_mask;
	/* Duration of this entry, as configured by the user */
	u32 interval;
	/* Gate operation (set via TCA_TAPRIO_SCHED_ENTRY_CMD) */
	u8 command;
};
/* A full taprio schedule, either the operational ("oper") one or the
 * administrative ("admin") one pending installation.
 */
struct sched_gate_list {
	/* Longest non-zero contiguous gate durations per traffic class,
	 * or 0 if a traffic class gate never opens during the schedule.
	 */
	u64 max_open_gate_duration[TC_MAX_QUEUE];
	u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
	u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
	/* Deferred freeing via call_rcu() */
	struct rcu_head rcu;
	/* List of struct sched_entry, linked through sched_entry :: list */
	struct list_head entries;
	size_t num_entries;
	/* End of the current cycle, advanced by cycle_time as we go */
	ktime_t cycle_end_time;
	s64 cycle_time;
	s64 cycle_time_extension;
	s64 base_time;
};
/* Private data of the taprio qdisc */
struct taprio_sched {
	/* Per-TXQ child qdiscs */
	struct Qdisc **qdiscs;
	struct Qdisc *root;
	u32 flags;
	enum tk_offsets tk_offset;
	int clockid;
	bool offloaded;
	bool detected_mqprio;
	bool broken_mqprio;
	/* Using picoseconds because for 10Gbps+ speeds it's
	 * sub-nanoseconds per byte
	 */
	atomic64_t picos_per_byte;
	/* Protects the update side of the RCU protected current_entry */
	spinlock_t current_entry_lock;
	struct sched_entry __rcu *current_entry;
	struct sched_gate_list __rcu *oper_sched;
	struct sched_gate_list __rcu *admin_sched;
	/* Timer that advances the schedule to the next entry */
	struct hrtimer advance_timer;
	struct list_head taprio_list;
	/* Next TXQ to serve per traffic class, for round-robin dequeue */
	int cur_txq[TC_MAX_QUEUE];
	u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
	u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */
	u32 txtime_delay;
};
/* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC.
*/
cur = entry;
do { if (!gates_still_open) break;
for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue;
cur = list_next_entry_circular(cur, &sched->entries, list);
} while (cur != entry);
/* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed.
*/ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] &&
sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
}
}
if (*oper)
call_rcu(&(*oper)->rcu, taprio_free_sched_cb);
*oper = *admin;
*admin = NULL;
}
/* Get how much time has been already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
{
ktime_t time_since_sched_start;
s32 time_elapsed;
/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed.
*/ staticvoid taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab)
{ struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev);
u32 max_sdu_from_user;
u32 max_sdu_dynamic;
u32 max_sdu; int tc;
/* TC gate never closes => keep the queueMaxSDU * selected by the user
*/ if (sched->max_open_gate_duration[tc] == sched->cycle_time) {
max_sdu_dynamic = U32_MAX;
} else {
u32 max_frm_len;
max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative
*/ if (stab) {
max_frm_len -= stab->szopts.overhead;
max_frm_len = max_t(int, max_frm_len,
dev->hard_header_len + 1);
}
max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu)
max_sdu_dynamic = U32_MAX;
}
/* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
{ unsignedint offset = skb_network_offset(skb); conststruct ipv6hdr *ipv6h; conststruct iphdr *iph; struct ipv6hdr _ipv6h;
ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0;
/* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home
*/ if (iph->protocol == IPPROTO_IPV6) {
ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
/* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window.
*/ staticlong get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
{
ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin;
ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed;
now = taprio_get_time(q);
minimum_time = ktime_add_ns(now, q->txtime_delay);
/* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) {
txtime = minimum_time; goto done;
}
len = qdisc_pkt_len(skb);
packet_transmit_time = length_to_duration(q, len);
/* Update the txtime of current entry to the next time it's * interval starts.
*/ if (ktime_after(transmit_end_time, interval_end))
entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
} while (sched_changed || ktime_after(transmit_end_time, interval_end));
entry->next_txtime = transmit_end_time;
done:
rcu_read_unlock(); return txtime;
}
/* Devices with full offload are expected to honor this in hardware */ staticbool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb)
{ struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false;
u8 tc;
/* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free);
} elseif (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free);
}
/* FIXME: we should be segmenting to a smaller size * rather than dropping these
*/ if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
ret = qdisc_drop(segs, sch, to_free); else
ret = taprio_enqueue_one(segs, sch, child, to_free);
if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
} else {
numsegs++;
}
}
if (numsegs > 1)
qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
consume_skb(skb);
/* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt()
*/ staticint taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{ struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue;
queue = skb_get_queue_mapping(skb);
child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free);
if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware.
*/ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child,
to_free);
for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time)
budget = INT_MAX; else
budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
atomic64_read(&q->picos_per_byte));
atomic_set(&entry->budget[tc], budget);
}
}
/* When an skb is sent, it consumes from the budget of all traffic classes */ staticint taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc)
{ int tc, budget, new_budget = 0;
for (tc = 0; tc < num_tc; tc++) {
budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed)
new_budget = budget; continue;
}
len = qdisc_pkt_len(skb);
guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));
/* In the case that there's no gate entry, there's no * guard band ...
*/ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
!taprio_entry_allows_tx(guard, entry, tc)) return NULL;
/* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL;
skip_peek_checks:
skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL;
staticvoid taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
{ int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count;
/* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin
*/ staticstruct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry,
u32 gate_mask)
{ struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc;
for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc];
if (!(gate_mask & BIT(tc))) continue;
do {
skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
entry, gate_mask);
taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
if (q->cur_txq[tc] >= dev->num_tx_queues)
q->cur_txq[tc] = first_txq;
if (skb) return skb;
} while (q->cur_txq[tc] != first_txq);
}
return NULL;
}
/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not
*/ staticstruct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry,
u32 gate_mask)
{ struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i;
for (i = 0; i < dev->num_tx_queues; i++) {
skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb;
}
return NULL;
}
/* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt()
*/ staticstruct sk_buff *taprio_dequeue(struct Qdisc *sch)
{ struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry;
u32 gate_mask;
rcu_read_lock();
entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates"
*/
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done;
if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
!static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */
skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
} elseif (static_branch_likely(&taprio_have_working_mqprio) &&
!static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */
skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
} else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio)
skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else
skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
}
/* This is the simple case, the end_time would fall after * the next schedule base_time.
*/ if (ktime_compare(next_base_time, end_time) <= 0) returntrue;
/* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount.
*/
extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);
/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change.
*/ if (ktime_compare(next_base_time, extension_time) <= 0) returntrue;
/* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules are pre-calculated during the * schedule initialization.
*/ if (unlikely(!entry || entry->end_time == oper->base_time)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
end_time = next->end_time; goto first_run;
}
if (should_restart_cycle(oper, entry)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
oper->cycle_time);
} else {
next = list_next_entry(entry, list);
}
if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs.
*/
end_time = sched_base_time(admin);
switch_schedules(q, &admin, &oper);
}
if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
entry->command = nla_get_u8(
tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);
if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
entry->gate_mask = nla_get_u32(
tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);
if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
interval = nla_get_u32(
tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
/* The interval should allow at least the minimum ethernet * frame to go out.
*/ if (interval < min_duration) {
NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL;
}
if (!qopt) { if (!dev->num_tc) {
NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL;
} return 0;
}
/* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) {
NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL;
}
/* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges.
*/ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs,
extack);
}
base = sched_base_time(sched);
now = taprio_get_time(q);
if (ktime_after(base, now)) {
*start = base; return 0;
}
cycle = sched->cycle_time;
/* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error.
*/ if (WARN_ON(!cycle)) return -EFAULT;
/* Schedule the start time for the beginning of the next * cycle.
*/
n = div64_s64(ktime_sub_ns(now, base), cycle);
*start = ktime_add_ns(base, (n + 1) * cycle); return 0;
}
expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0)
expires = KTIME_MAX;
/* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time.
*/
start = min_t(ktime_t, start, expires);
/* Query the link speed via ethtool and cache the per-byte transmit
 * duration, in picoseconds, into q->picos_per_byte. Falls back to
 * 10 Mbit/s when the speed cannot be determined.
 */
static void taprio_set_picos_per_byte(struct net_device *dev,
				      struct taprio_sched *q,
				      struct netlink_ext_ack *extack)
{
	struct ethtool_link_ksettings ecmd;
	int speed = SPEED_10;
	int picos_per_byte;
	int err;

	err = __ethtool_get_link_ksettings(dev, &ecmd);
	if (err < 0)
		goto skip;

	if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
		speed = ecmd.base.speed;

skip:
	picos_per_byte = (USEC_PER_SEC * 8) / speed;

	/* Clamp very fast links so the schedule duration math stays sane;
	 * warn because the schedule may then be inaccurate
	 */
	if (picos_per_byte < TAPRIO_PICOS_PER_BYTE_MIN) {
		if (!extack)
			pr_warn("Link speed %d is too high. Schedule may be inaccurate.\n",
				speed);
		NL_SET_ERR_MSG_FMT_MOD(extack, "Link speed %d is too high. Schedule may be inaccurate.",
				       speed);
		picos_per_byte = TAPRIO_PICOS_PER_BYTE_MIN;
	}

	atomic64_set(&q->picos_per_byte, picos_per_byte);
	netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
		   dev->name, (long long)atomic64_read(&q->picos_per_byte),
		   ecmd.base.speed);
}
/* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * users looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO.
*/ staticvoid taprio_offload_config_changed(struct taprio_sched *q)
{ struct sched_gate_list *oper, *admin;
err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) {
NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done;
}
q->offloaded = true;
done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid
*/
offload->extack = NULL;
offload->mqprio.extack = NULL;
taprio_offload_free(offload);
offload = taprio_offload_alloc(0); if (!offload) {
NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM;
}
offload->cmd = TAPRIO_CMD_DESTROY;
err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) {
NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out;
}
q->offloaded = false;
out:
taprio_offload_free(offload);
return err;
}
/* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and hence mandatory.
*/ staticint taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack)
{ struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL;
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { conststruct ethtool_ops *ops = dev->ethtool_ops; struct kernel_ethtool_ts_info info = {
.cmd = ETHTOOL_GET_TS_INFO,
.phc_index = -1,
};
if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out;
}
if (ops && ops->get_ts_info)
err = ops->get_ts_info(dev, &info);
if (err || info.phc_index < 0) {
NL_SET_ERR_MSG(extack, "Device does not have a PTP clock");
err = -ENOTSUPP; goto out;
}
} elseif (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset;
/* We only support static clockids and we don't allow * for it to be modified after the first init.
*/ if (clockid < 0 ||
(q->clockid != -1 && q->clockid != clockid)) {
NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported");
err = -ENOTSUPP; goto out;
}
switch (clockid) { case CLOCK_REALTIME:
tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC:
tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME:
tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI:
tk_offset = TK_OFFS_TAI; break; default:
NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
err = -EINVAL; goto out;
} /* This pairs with READ_ONCE() in taprio_mono_to_any */
WRITE_ONCE(q->tk_offset, tk_offset);
q->clockid = clockid;
} else {
NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out;
}
/* Everything went ok, return success. */
err = 0;
if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) {
val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) {
NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE;
}
max_sdu[tc] = val;
}
if (tb[TCA_TAPRIO_TC_ENTRY_FP])
fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]);
if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) {
NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP;
}
if (!ethtool_dev_mm_supported(dev)) {
NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP;
}
}
return err;
}
staticint taprio_mqprio_cmp(conststruct net_device *dev, conststruct tc_mqprio_qopt *mqprio)
{ int i;
if (!mqprio || mqprio->num_tc != dev->num_tc) return -1;
for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] ||
dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1;
for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1;
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
/* The semantics of the 'flags' argument in relation to 'change()' * requests, are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed.
*/
taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0);
/* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
(taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) {
NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); return -EINVAL;
}
if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) {
NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP;
}
q->flags = taprio_flags;
/* Needed for length_to_duration() during netlink attribute parsing */
taprio_set_picos_per_byte(dev, q, extack);
err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err;
new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) {
NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM;
}
INIT_LIST_HEAD(&new_admin->entries);
/* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio))
mqprio = NULL;
if (mqprio && (oper || admin)) {
NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
err = -ENOTSUPP; goto free_sched;
}
if (mqprio) {
err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) {
netdev_set_tc_queue(dev, i,
mqprio->count[i],
mqprio->offset[i]);
q->cur_txq[i] = mqprio->offset[i];
}
/* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++)
netdev_set_prio_tc_map(dev, i,
mqprio->prio_tc_map[i]);
}
if (FULL_OFFLOAD_IS_ENABLED(q->flags))
err = taprio_enable_offload(dev, q, new_admin, extack); else
err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched;
/* Protects against enqueue()/dequeue() */
spin_lock_bh(qdisc_lock(sch));
if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
err = -EINVAL; goto unlock;
}
err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) {
NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock;
}
setup_txtime(q, new_admin, start);
if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) {
rcu_assign_pointer(q->oper_sched, new_admin);
err = 0;
new_admin = NULL; goto unlock;
}
/* Not going to race against advance_sched(), but still */
admin = rcu_replace_pointer(q->admin_sched, new_admin,
lockdep_rtnl_is_held()); if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
} else {
setup_first_end_time(q, new_admin, start);
/* Protects against advance_sched() */
spin_lock_irqsave(&q->current_entry_lock, flags);
taprio_start_sched(sch, start, new_admin);
admin = rcu_replace_pointer(q->admin_sched, new_admin,
lockdep_rtnl_is_held()); if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
/* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called.
*/
hrtimer_cancel(&q->advance_timer);
qdisc_synchronize(sch);
taprio_disable_offload(dev, q, NULL);
if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++)
qdisc_put(q->qdiscs[i]);
/* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change().
*/
q->clockid = -1;
q->flags = TAPRIO_FLAGS_INVALID;
list_add(&q->taprio_list, &taprio_list);
if (sch->parent != TC_H_ROOT) {
NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP;
}
if (!netif_is_multiqueue(dev)) {
NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP;
}
q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]),
GFP_KERNEL); if (!q->qdiscs) return -ENOMEM;
if (!opt) return -EINVAL;
for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc;
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx];
/* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly
*/
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
dev_queue_qdisc = qdisc;
} else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue().
*/
dev_queue_qdisc = sch;
}
old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto
*/
qdisc_refcount_inc(dev_queue_qdisc); if (old)
qdisc_put(old);
}
}
/* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children.
*/
*old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new)
qdisc_refcount_inc(new); if (*old)
qdisc_put(*old);
}
/* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself
*/ if (!ops->ndo_setup_tc) return 0;
memset(stats, 0xff, sizeof(*stats));
err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err;
xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err;
if (taprio_put_stat(skb, stats->window_drops,
TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) ||
taprio_put_stat(skb, stats->tx_overruns,
TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto err_cancel;
The information on this web page has been compiled carefully and to the
best of our knowledge. However, no guarantee is given as to the completeness,
correctness, or quality of the information provided.
Note:
The colored syntax highlighting and the measurement are still experimental.