/* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8

/* Total max dispatch from all groups in one round */
#define THROTL_QUANTUM 32

/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
/** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest * * Return the throtl_grp @sq belongs to. If @sq is the top-level one * embedded in throtl_data, %NULL is returned.
*/ staticstruct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
{ if (sq && sq->parent_sq) return container_of(sq, struct throtl_grp, service_queue); else return NULL;
}
/** * sq_to_td - return throtl_data the specified service queue belongs to * @sq: the throtl_service_queue of interest * * A service_queue can be embedded in either a throtl_grp or throtl_data. * Determine the associated throtl_data accordingly and return it.
*/ staticstruct throtl_data *sq_to_td(struct throtl_service_queue *sq)
{ struct throtl_grp *tg = sq_to_tg(sq);
/**
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @sq: the service_queue @qn belongs to
 *
 * Add @bio to @qn and put @qn on @sq->queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated.  See the
 * comment on top of throtl_qnode definition for details.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
				 struct throtl_service_queue *sq)
{
	bool rw = bio_data_dir(bio);

	/*
	 * Split bios have already been throttled by bps, so they are
	 * directly queued into the iops path.
	 */
	if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
	    bio_flagged(bio, BIO_BPS_THROTTLED)) {
		bio_list_add(&qn->bios_iops, bio);
		sq->nr_queued_iops[rw]++;
	} else {
		bio_list_add(&qn->bios_bps, bio);
		sq->nr_queued_bps[rw]++;
	}

	/* first bio on an inactive qnode: activate it and pin its tg */
	if (list_empty(&qn->node)) {
		list_add_tail(&qn->node, &sq->queued[rw]);
		blkg_get(tg_to_blkg(qn->tg));
	}
}
/** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek * * Always take a bio from the head of the iops queue first. If the queue is * empty, we then take it from the bps queue to maintain the overall idea of * fetching bios from the head.
*/ staticstruct bio *throtl_peek_queued(struct list_head *queued)
{ struct throtl_qnode *qn; struct bio *bio;
if (list_empty(queued)) return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_peek(&qn->bios_iops); if (!bio)
bio = bio_list_peek(&qn->bios_bps);
WARN_ON_ONCE(!bio); return bio;
}
/** * throtl_pop_queued - pop the first bio form a qnode list * @sq: the service_queue to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put * @rw: read/write * * Pop the first bio from the qnode list @sq->queued. Note that we firstly * focus on the iops list because bios are ultimately dispatched from it. * After popping, the first qnode is removed from @sq->queued if empty or moved * to the end of @sq->queued so that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it.
*/ staticstruct bio *throtl_pop_queued(struct throtl_service_queue *sq, struct throtl_grp **tg_to_put, bool rw)
{ struct list_head *queued = &sq->queued[rw]; struct throtl_qnode *qn; struct bio *bio;
if (list_empty(queued)) return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_pop(&qn->bios_iops); if (bio) {
sq->nr_queued_iops[rw]--;
} else {
bio = bio_list_pop(&qn->bios_bps); if (bio)
sq->nr_queued_bps[rw]--;
}
WARN_ON_ONCE(!bio);
if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
list_del_init(&qn->node); if (tg_to_put)
*tg_to_put = qn->tg; else
blkg_put(tg_to_blkg(qn->tg));
} else {
list_move_tail(&qn->node, queued);
}
return bio;
}
/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
	INIT_LIST_HEAD(&sq->queued[READ]);
	INIT_LIST_HEAD(&sq->queued[WRITE]);
	sq->pending_tree = RB_ROOT_CACHED;
	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
/*
 * NOTE(review): fragment -- the head of the enclosing function
 * (apparently throtl_pd_init: its signature and the declarations of
 * @tg, @sq, @td and @blkg) was lost during text extraction; restore it
 * from the upstream source before building.
 *
 * If on the default hierarchy, we switch to properly hierarchical
 * behavior where limits on a given throtl_grp are applied to the
 * whole subtree rather than just the group itself. e.g. If 16M
 * read_bps limit is set on a parent group, summary bps of
 * parent group and its subtree groups can't exceed 16M for the
 * device.
 *
 * If not on the default hierarchy, the broken flat hierarchy
 * behavior is retained where all throtl_grps are treated as if
 * they're all separate root groups right below throtl_data.
 * Limits of a group don't interact with limits of other groups
 * regardless of the position of the group in the hierarchy.
 */
sq->parent_sq = &td->service_queue; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
tg->td = td;
}
/*
 * Set has_rules[] if @tg or any of its parents have limits configured.
 * This doesn't require walking up to the top of the hierarchy as the
 * parent's has_rules[] is guaranteed to be correct.
 *
 * NOTE(review): truncated by text extraction -- only the declarations
 * survive; the loop that derives the per-rw flags from @parent_tg and
 * the closing brace are missing.  "staticvoid" is a lost-space
 * artifact of the extraction; restore from upstream before building.
 */ staticvoid tg_update_has_rules(struct throtl_grp *tg)
{ struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); int rw;
/* pd_online callback: a new blkcg policy data (group) went online */
static void throtl_pd_online(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);

	/*
	 * We don't want new groups to escape the limits of its ancestors.
	 * Update has_rules[] after a new group is brought online.
	 */
	tg_update_has_rules(tg);
}
/*
 * NOTE(review): fragment -- the head of the enclosing function
 * (apparently throtl_schedule_pending_timer(), which computes @expires
 * and @max_expire) was lost during text extraction; restore from the
 * upstream source before building.
 *
 * Since we are adjusting the throttle limit dynamically, the sleep
 * time calculated according to previous limit might be invalid. It's
 * possible the cgroup sleep time is very long and no other cgroups
 * have IO running so notify the limit changes. Make sure the cgroup
 * doesn't sleep too long to avoid the missed notification.
 */ if (time_after(expires, max_expire))
expires = max_expire;
mod_timer(&sq->pending_timer, expires);
throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
expires - jiffies, jiffies);
}
/**
 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
 * @sq: the service_queue to schedule dispatch for
 * @force: force scheduling
 *
 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
 * dispatch time of the first pending child.  Returns %true if either timer
 * is armed or there's no pending child left.  %false if the current
 * dispatch window is still open and the caller should continue
 * dispatching.
 *
 * If @force is %true, the dispatch timer is always scheduled and this
 * function is guaranteed to return %true.  This is to be used when the
 * caller can't dispatch itself and needs to invoke pending_timer
 * unconditionally.  Note that forced scheduling is likely to induce short
 * delay before dispatch starts even if @sq->first_pending_disptime is not
 * in the future and thus shouldn't be used in hot paths.
 */
static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
					  bool force)
{
	/* any pending children left? */
	if (!sq->nr_pending)
		return true;

	update_min_dispatch_time(sq);

	/* is the next dispatch time in the future? */
	if (force || time_after(sq->first_pending_disptime, jiffies)) {
		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
		return true;
	}

	/* tell the caller to continue dispatching */
	return false;
}
/*
 * NOTE(review): fragment -- the enclosing function's head and tail
 * (presumably the slice-(re)start helper that computes @start) were
 * lost during text extraction; restore from upstream before building.
 *
 * Previous slice has expired. We must have trimmed it after last
 * bio dispatch. That means since start of last slice, we never used
 * that bandwidth. Do try to make use of that bandwidth while giving
 * credit.
 */ if (time_after(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
/* Determine if previously allocated or extended slice is complete or not */ staticbool throtl_slice_used(struct throtl_grp *tg, bool rw)
{ if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) returnfalse;
/*
 * NOTE(review): truncated by text extraction -- the remaining path of
 * throtl_slice_used (presumably "return true;" and the closing brace)
 * is missing, and "staticbool"/"returnfalse" above are lost-space
 * artifacts.  The comment below is detached: it belongs to a different
 * function whose body is entirely missing here.
 *
 * jiffy_elapsed should not be a big value as minimum iops can be
 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
 * will allow dispatch after 1 second and after that slice should
 * have been trimmed.
 */
/*
 * Bytes allowed in @jiffy_elapsed jiffies at @bps_limit bytes/sec,
 * saturating to U64_MAX when the product cannot fit in 64 bits.
 */
static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
{
	/*
	 * Can result be wider than 64 bits?
	 * We check against 62, not 64, due to ilog2 truncation.
	 */
	if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62)
		return U64_MAX;
	return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
/*
 * NOTE(review): fragment -- this is the tail of an io-trimming helper
 * (it returns the number of ios trimmed from tg->io_disp[rw]); its
 * signature and the computation of @time_elapsed were lost during text
 * extraction.  Restore from the upstream source before building.
 *
 * Need to consider the case of io_allowed overflow.
 */
io_trim = calculate_io_allowed(iops_limit, time_elapsed); if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
io_trim = tg->io_disp[rw];
tg->io_disp[rw] = 0;
} else {
tg->io_disp[rw] -= io_trim;
}
return io_trim;
}
/*
 * NOTE(review): truncated by text extraction -- the tail of
 * throtl_trim_slice (presumably the slice_start adjustment, a trim
 * throtl_log() and the closing brace) is missing after the last line
 * below.  "staticinlinevoid", "unsignedlong" and "longlong" are
 * lost-space artifacts; restore from upstream before building.
 */
/* Trim the used slices and adjust slice start accordingly */ staticinlinevoid throtl_trim_slice(struct throtl_grp *tg, bool rw)
{ unsignedlong time_elapsed; longlong bytes_trim; int io_trim;
/*
 * If bps are unlimited (-1), then time slice don't get
 * renewed. Don't try to trim the slice if slice is used. A new
 * slice will start when appropriate.
 */ if (throtl_slice_used(tg, rw)) return;
/*
 * A bio has been dispatched. Also adjust slice_end. It might happen
 * that initially cgroup limit was very low resulting in high
 * slice_end, but later limit was bumped up and bio was dispatched
 * sooner, then we need to reduce slice_end. A high bogus slice_end
 * is bad because it does not allow new slice to start.
 */
throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
time_elapsed = rounddown(jiffies - tg->slice_start[rw],
tg->td->throtl_slice); /* Don't trim slice until at least 2 slices are used */ if (time_elapsed < tg->td->throtl_slice * 2) return;
/*
 * The bio submission time may be a few jiffies more than the expected
 * waiting time, due to 'extra_bytes' can't be divided in
 * tg_within_bps_limit(), and also due to timer wakeup delay. In this
 * case, adjust slice_start will discard the extra wait time, causing
 * lower rate than expected. Therefore, other than the above rounddown,
 * one extra slice is preserved for deviation.
 */
time_elapsed -= tg->td->throtl_slice;
bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
io_trim = throtl_trim_iops(tg, rw, time_elapsed); if (!bytes_trim && !io_trim) return;
/*
 * NOTE(review): fragment of a carryover helper -- its signature and the
 * declarations of @bytes_allowed, @io_allowed, @jiffy_elapsed, @bytes
 * and @ios were lost during text extraction; restore from upstream
 * before building.
 *
 * If the queue is empty, carryover handling is not needed. In such cases,
 * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch
 * of subsequent bios. The same handling applies when the previous BPS/IOPS
 * limit was set to max.
 */ if (sq_queued(&tg->service_queue, rw) == 0) {
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0; return;
}
/*
 * If config is updated while bios are still throttled, calculate and
 * accumulate how many bytes/ios are waited across changes. And use the
 * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which
 * will be used to calculate new wait time under new configuration.
 * And we need to consider the case of bytes/io_allowed overflow.
 */ if (bps_limit != U64_MAX) {
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed); if (bytes_allowed > 0)
*bytes = bytes_allowed - tg->bytes_disp[rw];
} if (iops_limit != UINT_MAX) {
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed); if (io_allowed > 0)
*ios = io_allowed - tg->io_disp[rw];
}
/* see comments in struct throtl_grp for meaning of carryover. */
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}
/*
 * Return 0 if @bio can be issued within @iops_limit in the current slice,
 * otherwise the approximate number of jiffies to wait before it can.
 */
static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
					  u32 iops_limit)
{
	bool rw = bio_data_dir(bio);
	int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffies - tg->slice_start[rw];

	/* Round up to the next throttle slice, wait time must be nonzero */
	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
	if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
		return 0;

	/* Calc approx time to dispatch */
	jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;

	/* make sure at least one io can be dispatched after waiting */
	jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1);
	return jiffy_wait;
}
/*
 * NOTE(review): fragment -- this is the middle/tail of a bps-limit
 * check (the bps counterpart of tg_within_iops_limit); its signature
 * and the declarations of @jiffy_elapsed, @jiffy_elapsed_rnd,
 * @bytes_allowed, @extra_bytes, @bio_size and @rw were lost during
 * text extraction.  Restore from upstream before building.
 */
/* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed)
jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); /* Need to consider the case of bytes_allowed overflow. */ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
|| bytes_allowed < 0) return 0;
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
if (!jiffy_wait)
jiffy_wait = 1;
/*
 * This wait time is without taking into consideration the rounding
 * up we did. Add that time also.
 */
jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); return jiffy_wait;
}
/*
 * NOTE(review): fragment -- tail of a bps-charging helper; the function
 * head (with the @tg, @bio and @bio_size context) was lost during text
 * extraction.  Restore from upstream before building.
 *
 * Charge the bio to the group: account its bytes once only, and mark it
 * so it is not bps-charged again on a later pass.
 */ if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
!bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
tg->bytes_disp[bio_data_dir(bio)] += bio_size;
}
}
/*
 * If previous slice expired, start a new one otherwise renew/extend existing
 * slice to make sure it is at least throtl_slice interval long since now. New
 * slice is started only for empty throttle group. If there is queued bio, that
 * means there should be an active slice and it should be extended instead.
 */
static void tg_update_slice(struct throtl_grp *tg, bool rw)
{
	if (throtl_slice_used(tg, rw) &&
	    sq_queued(&tg->service_queue, rw) == 0)
		throtl_start_new_slice(tg, rw, true);
	else
		throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
}
/*
 * NOTE(review): fragment -- an early-return guard from a bps dispatch-time
 * function; both the function head (with @bps_limit) and the rest of the
 * body were lost during text extraction.  Restore from upstream.
 *
 * no need to throttle if this bio's bytes have been accounted
 */ if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
bio_flagged(bio, BIO_BPS_THROTTLED) ||
bio_flagged(bio, BIO_TG_BPS_THROTTLED)) return 0;
/*
 * Returns approx number of jiffies to wait before this bio is with-in IO rate
 * and can be moved to other queue or dispatched.
 */
static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	unsigned long wait;

	/*
	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(sq_queued(&tg->service_queue, rw) &&
	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

	/* bps limit first; a nonzero wait means @bio stays on the bps queue */
	wait = tg_dispatch_bps_time(tg, bio);
	if (wait != 0)
		return wait;

	/*
	 * Charge bps here because @bio will be directly placed into the
	 * iops queue afterward.
	 */
	throtl_charge_bps_bio(tg, bio);

	return tg_dispatch_iops_time(tg, bio);
}
/**
 * throtl_add_bio_tg - add a bio to the specified throtl_grp
 * @bio: bio to add
 * @qn: qnode to use
 * @tg: the target throtl_grp
 *
 * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
 * tg->qnode_on_self[] is used.
 *
 * NOTE(review): truncated by text extraction -- the function ends
 * abruptly after the IOPS_WAS_EMPTY flag update; its tail (presumably
 * enqueueing @tg on the parent's pending tree, plus the closing brace)
 * is missing.  "staticvoid" is a lost-space artifact.  Restore from
 * the upstream source before building.
 */ staticvoid throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, struct throtl_grp *tg)
{ struct throtl_service_queue *sq = &tg->service_queue; bool rw = bio_data_dir(bio);
if (!qn)
qn = &tg->qnode_on_self[rw];
/*
 * If @tg doesn't currently have any bios queued in the same
 * direction, queueing @bio can change when @tg should be
 * dispatched. Mark that @tg was empty. This is automatically
 * cleared on the next tg_update_disptime().
 */ if (sq_queued(sq, rw) == 0)
tg->flags |= THROTL_TG_WAS_EMPTY;
throtl_qnode_add_bio(bio, qn, sq);
/*
 * Since we have split the queues, when the iops queue is
 * previously empty and a new @bio is added into the first @qn,
 * we also need to update the @tg->disptime.
 */ if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
bio == throtl_peek_queued(&sq->queued[rw]))
tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
/*
 * NOTE(review): fragment -- middle of a per-bio dispatch function; its
 * head (declaring @sq, @tg_to_put, @parent_tg, @parent_sq and @rw) and
 * its tail were lost during text extraction.  Restore from upstream
 * before building.
 *
 * @bio is being transferred from @tg to @parent_sq. Popping a bio
 * from @tg may put its reference and @parent_sq might end up
 * getting released prematurely. Remember the tg to put and put it
 * after @bio is transferred to @parent_sq.
 */
bio = throtl_pop_queued(sq, &tg_to_put, rw);
throtl_charge_iops_bio(tg, bio);
/*
 * If our parent is another tg, we just need to transfer @bio to
 * the parent using throtl_add_bio_tg(). If our parent is
 * @td->service_queue, @bio is ready to be issued. Put it on its
 * bio_lists[] and decrease total number queued. The caller is
 * responsible for issuing these bios.
 */ if (parent_tg) {
throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
start_parent_slice_with_credit(tg, parent_tg, rw);
} else {
bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
parent_sq);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
}
/*
 * NOTE(review): this function is internally mangled by text extraction.
 * Several statements are missing: the queue_lock acquisition, the
 * "again:" label and the "while (true)" dispatch loop that the "break"
 * and "goto again" below refer to, and the initialization of
 * @dispatched and @parent_sq.  Restore from the upstream source before
 * building.  "staticvoid" is a lost-space artifact.
 */
/** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced * * This timer is armed when a child throtl_grp with active bio's become * pending and queued on the service_queue's pending_tree and expires when * the first child throtl_grp should be dispatched. This function * dispatches bio's from the children throtl_grps to the parent * service_queue. * * If the parent's parent is another throtl_grp, dispatching is propagated * by either arming its pending_timer or repeating dispatch directly. If * the top-level service_tree is reached, throtl_data->dispatch_work is * kicked so that the ready bio's are issued.
 */ staticvoid throtl_pending_timer_fn(struct timer_list *t)
{ struct throtl_service_queue *sq = timer_container_of(sq, t,
pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); struct throtl_service_queue *parent_sq; struct request_queue *q; bool dispatched; int ret;
/* throtl_data may be gone, so figure out request queue by blkg */ if (tg)
q = tg->pd.blkg->q; else
q = td->queue;
ret = throtl_select_dispatch(sq); if (ret) {
throtl_log(sq, "bios disp=%u", ret);
dispatched = true;
}
if (throtl_schedule_next_dispatch(sq, false)) break;
/* this dispatch windows is still open, relax and repeat */
spin_unlock_irq(&q->queue_lock);
cpu_relax();
spin_lock_irq(&q->queue_lock);
}
if (!dispatched) goto out_unlock;
if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ if (tg->flags & THROTL_TG_WAS_EMPTY ||
tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */
sq = parent_sq;
tg = sq_to_tg(sq); goto again;
}
}
} else { /* reached the top-level, queue issuing */
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
spin_unlock_irq(&q->queue_lock);
}
/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bios reach the bio_lists[]
 * of throtl_data->service_queue.  Those bios are ready and issued by this
 * function.
 */
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					      dispatch_work);
	struct throtl_service_queue *td_sq = &td->service_queue;
	struct request_queue *q = td->queue;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;
	int rw;

	bio_list_init(&bio_list_on_stack);

	/* drain the ready bios onto the on-stack list under the queue lock */
	spin_lock_irq(&q->queue_lock);
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
			bio_list_add(&bio_list_on_stack, bio);
	spin_unlock_irq(&q->queue_lock);

	/* issue the collected bios outside the lock, under a plug */
	if (!bio_list_empty(&bio_list_on_stack)) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			submit_bio_noacct_nocheck(bio, false);
		blk_finish_plug(&plug);
	}
}
/*
 * NOTE(review): heavily truncated fragment of a config-update function.
 * The head is missing, the body of the blkg_for_each_descendant_pre()
 * loop (which presumably updates @this_tg's rules) is cut off, and the
 * matching rcu_read_unlock() plus the declarations of @blkg, @pos_css,
 * @global and @sq are absent.  Restore from upstream before building.
 */
rcu_read_lock(); /*
 * Update has_rules[] flags for the updated tg's subtree. A tg is
 * considered to have rules if either the tg itself or any of its
 * ancestors has rules. This identifies groups without any
 * restrictions in the whole hierarchy and allows them to bypass
 * blk-throttle.
 */
blkg_for_each_descendant_pre(blkg, pos_css,
global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg);
/*
 * We're already holding queue_lock and know @tg is valid. Let's
 * apply the new config directly.
 *
 * Restart the slices for both READ and WRITES. It might happen
 * that a group's limit are dropped suddenly and we don't want to
 * account recently dispatched IO with new low rate.
 */
throtl_start_new_slice(tg, READ, false);
throtl_start_new_slice(tg, WRITE, false);
if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(sq->parent_sq, true);
}
}
/*
 * NOTE(review): fragment -- interior of a tg-cancel helper (it sets
 * THROTL_TG_CANCELING and refreshes the dispatch time); the function
 * head and tail were lost during text extraction.  Restore from the
 * upstream source before building.
 */
if (tg->flags & THROTL_TG_CANCELING) return; /*
 * Set the flag to make sure throtl_pending_timer_fn() won't
 * stop until all throttled bios are dispatched.
 */
tg->flags |= THROTL_TG_CANCELING;
/*
 * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
 * will be inserted to service queue without THROTL_TG_PENDING
 * set in tg_update_disptime below. Then IO dispatched from
 * child in tg_dispatch_one_bio will trigger double insertion
 * and corrupt the tree.
 */ if (!(tg->flags & THROTL_TG_PENDING)) return;
/*
 * Update disptime after setting the above flag to make sure
 * throtl_select_dispatch() won't exit without dispatching.
 */
tg_update_disptime(tg);
/*
 * NOTE(review): fragment -- tail of a cancel-all-bios function; the
 * function signature and the derivation of @q were lost during text
 * extraction.  Restore from upstream before building.
 */
spin_lock_irq(&q->queue_lock); /*
 * queue_lock is held, rcu lock is not needed here technically.
 * However, rcu lock is still held to emphasize that following
 * path need RCU protection and to prevent warning from lockdep.
 */
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { /*
 * disk_release will call pd_offline_fn to cancel bios.
 * However, disk_release can't be called if someone get
 * the refcount of device and issued bios which are
 * inflight after del_gendisk.
 * Cancel bios here to ensure no bios are inflight after
 * del_gendisk.
 */
tg_flush_bios(blkg_to_tg(blkg));
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
}
/*
 * NOTE(review): truncated fragment of a within-limit predicate; the
 * function head (declaring @sq from @tg->service_queue) and the final
 * return path for the empty-queue case are missing.  Restore from the
 * upstream source before building.
 *
 * For a split bio, we need to specifically distinguish whether the
 * iops queue is empty.
 */ if (bio_flagged(bio, BIO_BPS_THROTTLED)) return sq->nr_queued_iops[rw] == 0 &&
tg_dispatch_iops_time(tg, bio) == 0;
/*
 * Throtl is FIFO - if bios are already queued, should queue.
 * If the bps queue is empty and @bio is within the bps limit, charge
 * bps here for direct placement into the iops queue.
 */ if (sq_queued(&tg->service_queue, rw)) { if (sq->nr_queued_bps[rw] == 0 &&
tg_dispatch_bps_time(tg, bio) == 0)
throtl_charge_bps_bio(tg, bio);
/*
 * NOTE(review): fragment -- interior of the main bio-throttling entry
 * point.  Its head (declaring @qn, @sq, @tg, @rw and taking the queue
 * lock), the queueing path after the "break", and the "out_unlock"
 * label targeted by the goto are all missing.  Restore from the
 * upstream source before building.
 */
while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */
throtl_charge_iops_bio(tg, bio);
/*
 * We need to trim slice even when bios are not being
 * queued otherwise it might happen that a bio is not
 * queued for a long time and slice keeps on extending
 * and trim is not called for a long time. Now if limits
 * are reduced suddenly we take into account all the IO
 * dispatched so far at new low rate and * newly queued
 * IO gets a really long dispatch time.
 *
 * So keep on trimming slice even if bio is not queued.
 */
throtl_trim_slice(tg, rw);
} elseif (bio_issue_as_root_blkg(bio)) { /*
 * IOs which may cause priority inversions are
 * dispatched directly, even if they're over limit.
 *
 * Charge and dispatch directly, and our throttle
 * control algorithm is adaptive, and extra IO bytes
 * will be throttled for paying the debt
 */
throtl_charge_bps_bio(tg, bio);
throtl_charge_iops_bio(tg, bio);
} else { /* if above limits, break to queue */ break;
}
/*
 * @bio passed through this layer without being throttled.
 * Climb up the ladder. If we're already at the top, it
 * can be executed directly.
 */
qn = &tg->qnode_on_parent[rw];
sq = sq->parent_sq;
tg = sq_to_tg(sq); if (!tg) {
bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock;
}
}
/*
 * Update @tg's dispatch time and force schedule dispatch if @tg
 * was empty before @bio, or the iops queue is empty and @bio will
 * add to. The forced scheduling isn't likely to cause undue
 * delay as @bio is likely to be dispatched directly if its @tg's
 * disptime is not in the future.
 */ if (tg->flags & THROTL_TG_WAS_EMPTY ||
tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}
/*
 * NOTE(review): trailing boilerplate from the hosting web page, not part
 * of the kernel source.  English translation: "The information on this
 * website was compiled carefully to the best of our knowledge.  However,
 * neither completeness, correctness, nor quality of the provided
 * information is guaranteed.  Note: the colored syntax display and the
 * measurement are still experimental."  Remove before building.
 */