/* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue.
*/ staticbool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{ return !list_empty_careful(&hctx->dispatch) ||
sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
}
/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	/* Test first to avoid dirtying the shared ctx_map cacheline needlessly. */
	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}
/*
 * non_owner variant of blk_freeze_queue_start
 *
 * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
 * by the same task.  This is fragile and should not be used if at all
 * possible.
 */
void blk_freeze_queue_start_non_owner(struct request_queue *q)
{
	/* NULL owner: lockdep/ownership tracking is skipped for this freeze. */
	__blk_freeze_queue_start(q, NULL);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(&q->queue_lock, flags);
	/* Quiesce nests via quiesce_depth; set the flag on the first level only. */
	if (!q->quiesce_depth++)
		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @set: tag_set to wait on
 *
 * Note: it is driver's responsibility for making sure that quiesce has
 * been started on one or more of the request_queues of the tag_set.  This
 * function only waits for the quiesce on those request_queues that had
 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 */
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
	/* BLOCKING tag sets dispatch under SRCU; everything else uses plain RCU. */
	if (set->flags & BLK_MQ_F_BLOCKING)
		synchronize_srcu(set->srcu);
	else
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent that the struct request end_io()
 * callback function is invoked. Once this function is returned, we make
 * sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	blk_mq_quiesce_queue_nowait(q);
	/* nothing to wait for non-mq queues */
	if (queue_is_mq(q))
		blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
/* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue.
*/ void blk_mq_unquiesce_queue(struct request_queue *q)
{ unsignedlong flags; bool run_queue = false;
/* Set start and alloc time when the allocated request is actually used */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Record alloc time only when the queue has the feature enabled. */
	if (blk_queue_rq_alloc_time(rq->q))
		rq->alloc_time_ns = alloc_time_ns;
	else
		rq->alloc_time_ns = 0;
#endif
}
if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue.
*/
data->rq_flags |= RQF_SCHED_TAGS;
/* * Flush/passthrough requests are special and go directly to the * dispatch list.
*/ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
!blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops;
if (data->flags & BLK_MQ_REQ_RESERVED)
data->rq_flags |= RQF_RESV;
/* * Try batched alloc if we want more than 1 tag.
*/ if (data->nr_tags > 1) {
rq = __blk_mq_alloc_requests_batch(data); if (rq) {
blk_mq_rq_time_init(rq, alloc_time_ns); return rq;
}
data->nr_tags = 1;
}
/* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now.
*/
tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away.
*/
msleep(3); goto retry;
}
/* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q))
alloc_time_ns = blk_time_get_ns();
/* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue.
*/ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO);
ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret);
/* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
if (q->elevator)
data.rq_flags |= RQF_SCHED_TAGS; else
blk_mq_tag_busy(data.hctx);
if (flags & BLK_MQ_REQ_RESERVED)
data.rq_flags |= RQF_RESV;
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS))
blk_mq_inc_active_requests(data.hctx);
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL; return rq;
if (rq->rq_flags & RQF_USE_SCHED) {
q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq.
*/
rq->rq_flags &= ~RQF_USE_SCHED;
}
}
/* * Fully end IO on a request. Does not support partial completions, or * errors.
*/ staticvoid blk_complete_request(struct request *req)
{ constbool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio;
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
blk_integrity_complete(req, total_bytes);
/* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that.
*/
blk_crypto_rq_put_keyslot(req);
blk_account_io_completion(req, total_bytes);
do { struct bio *next = bio->bi_next;
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
if (blk_req_bio_is_zone_append(req, bio))
blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
bio = next;
} while (bio);
/* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later.
*/ if (!req->end_io) {
req->bio = NULL;
req->__data_len = 0;
}
}
/** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data
**/ bool blk_update_request(struct request *req, blk_status_t error, unsignedint nr_bytes)
{ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; bool quiet = req->rq_flags & RQF_QUIET; int total_bytes;
/* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that.
*/ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
if (unlikely(error))
bio->bi_status = error;
if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
} elseif (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written * sequentially.
*/
bio->bi_status = BLK_STS_IOERR;
}
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (unlikely(quiet))
bio_set_flag(bio, BIO_QUIET);
bio_advance(bio, bio_bytes);
/* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { if (blk_req_bio_is_zone_append(req, bio))
blk_zone_append_update_request_bio(req, bio); if (!is_flush)
bio_endio(bio);
}
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
if (!nr_bytes) break;
}
/* * completely done
*/ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later.
*/
req->__data_len = 0; returnfalse;
}
req->__data_len -= total_bytes;
/* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req))
req->__sector += total_bytes >> 9;
/* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) {
req->cmd_flags &= ~REQ_FAILFAST_MASK;
req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
}
if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong.
*/ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
blk_dump_rq_flags(req, "request botched");
req->__data_len = blk_rq_cur_bytes(req);
}
/* recalculate the number of segments */
req->nr_phys_segments = blk_recalc_rq_segments(req);
}
/* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough.
*/ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { constint sgrp = op_stat_group(req_op(req));
staticinlinebool blk_rq_passthrough_stats(struct request *req)
{ struct bio *bio = req->bio;
if (!blk_queue_passthrough_stat(req->q)) returnfalse;
/* Requests without a bio do not transfer data. */ if (!bio) returnfalse;
/* * Stats are accumulated in the bdev, so must have one attached to a * bio to track stats. Most drivers do not set the bdev for passthrough * requests, but nvme is one that will set it.
*/ if (!bio->bi_bdev) returnfalse;
/* * We don't know what a passthrough command does, but we know the * payload size and data direction. Ensuring the size is aligned to the * block size filters out most commands with payloads that don't * represent sector access.
*/ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) returnfalse; returntrue;
}
/* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio.
*/ if (req->bio)
req->part = req->bio->bi_bdev; else
req->part = req->q->disk->part0;
staticinlinebool blk_mq_complete_need_ipi(struct request *rq)
{ int cpu = raw_smp_processor_id();
if (!IS_ENABLED(CONFIG_SMP) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) returnfalse; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain.
*/ if (force_irqthreads()) returnfalse;
/* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) returnfalse;
/* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu);
}
/* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion.
*/ if ((rq->mq_hctx->nr_ctx == 1 &&
rq->mq_ctx->cpu == raw_smp_processor_id()) ||
rq->cmd_flags & REQ_POLLED) returnfalse;
if (blk_mq_complete_need_ipi(rq)) {
blk_mq_complete_send_ipi(rq); returntrue;
}
/**
 * blk_mq_complete_request - end I/O on a request
 * @rq: the request being processed
 *
 * Description:
 *	Complete a request by scheduling the ->complete_rq operation.
 **/
void blk_mq_complete_request(struct request *rq)
{
	/* Complete locally only when remote (IPI/softirq) completion wasn't queued. */
	if (!blk_mq_complete_request_remote(rq))
		rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so blk layer can do proper initializations
 * such as starting the timeout timer.
 */
void blk_mq_start_request(struct request *rq)
{
	/* NOTE(review): q is unused in the visible body; upstream also arms the
	 * timeout and traces the issue here — this copy looks truncated, confirm. */
	struct request_queue *q = rq->q;

	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
		blk_integrity_prepare(rq);

	/* Publish the hctx number as the poll cookie for polled bios. */
	if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
		WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
EXPORT_SYMBOL(blk_mq_start_request);
/*
 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
 * queues. This is important for md arrays to benefit from merging
 * requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
	if (plug->multiple_queues)
		return BLK_MAX_REQUEST_COUNT * 2;
	return BLK_MAX_REQUEST_COUNT;
}
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly
*/ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
rq_list_add_tail(&plug->mq_list, rq);
plug->rq_count++;
}
/** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead.
*/ void blk_execute_rq_nowait(struct request *rq, bool at_head)
{ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
bool blk_rq_is_poll(struct request *rq)
{ if (!rq->mq_hctx) returnfalse; if (rq->mq_hctx->type != HCTX_TYPE_POLL) returnfalse; returntrue;
}
EXPORT_SYMBOL_GPL(blk_rq_is_poll);
/* Busy-poll the request's hctx until @wait signals completion. */
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
	do {
		blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
		/* Yield between poll rounds so we don't hog the CPU. */
		cond_resched();
	} while (!completion_done(wait));
}
/** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request().
*/
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{ struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = {
.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
};
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist); /* * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request.
*/ if (rq->rq_flags & RQF_DONTPREP)
blk_mq_request_bypass_insert(rq, 0); else
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
}
staticbool blk_mq_rq_inflight(struct request *rq, void *priv)
{ /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. * * In case of queue quiesce, if one flush data request is completed, * don't count it as inflight given the flush sequence is suspended, * and the original flush data request is invisible to driver, just * like other pending requests because of quiesce
*/ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
blk_is_flush_data_rq(rq) &&
blk_mq_request_completed(rq))) { bool *busy = priv;
/* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired().
*/ if (blk_mq_req_expired(rq, expired)) {
expired->has_timedout_rq = true; returnfalse;
} returntrue;
}
/* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero.
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return;
/* check if there is any timed-out request */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished
*/
blk_mq_wait_quiesce_done(q->tag_set);
if (expired.next != 0) {
mod_timer(&q->timeout, expired.next);
} else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle.
*/
queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
}
blk_queue_exit(q);
}
/* * Process software queues that have been marked busy, splicing them * to the for-dispatch
*/ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{ struct flush_busy_ctx_data data = {
.hctx = hctx,
.list = list,
};
/* * Mark us waiting for a tag. For shared tags, this involves hooking us into * the tag wakeups. For non-shared tags, we can simply mark us needing a * restart. For both cases, take care to check the condition again after * marking us as waiting.
*/ staticbool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq)
{ struct sbitmap_queue *sbq; struct wait_queue_head *wq;
wait_queue_entry_t *wait; bool ret;
if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
!(blk_mq_is_shared_tags(hctx->flags))) {
blk_mq_sched_mark_restart_hctx(hctx);
/* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run.
*/ return blk_mq_get_driver_tag(rq);
}
wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) returnfalse;
/* * Add one explicit barrier since blk_mq_get_driver_tag() may * not imply barrier in case of failure. * * Order adding us to wait queue and allocating driver tag. * * The pair is the one implied in sbitmap_queue_wake_up() which * orders clearing sbitmap tag bits and waitqueue_active() in * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless * * Otherwise, re-order of adding wait queue and getting driver tag * may cause __sbitmap_queue_wake_up() to wake up nothing because * the waitqueue_active() may not observe us in wait queue.
*/
smp_mb();
/* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue.
*/
ret = blk_mq_get_driver_tag(rq); if (!ret) {
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock); returnfalse;
}
/* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup.
*/
list_del_init(&wait->entry);
atomic_dec(&sbq->ws_active);
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
returntrue;
}
#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially
*/ staticvoid blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{ unsignedint ewma;
if (need_budget) {
budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) {
blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET;
}
blk_mq_set_rq_budget_token(rq, budget_token);
}
if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below.
*/ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch
*/ if (need_budget)
blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG;
}
}
return PREP_DISPATCH_OK;
}
/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
static void blk_mq_release_budgets(struct request_queue *q,
				   struct list_head *list)
{
	struct request *rq;

	list_for_each_entry(rq, list, queuelist) {
		int budget_token = blk_mq_get_rq_budget_token(rq);

		/* A negative token means no budget was granted for this rq. */
		if (budget_token >= 0)
			blk_mq_put_dispatch_budget(q, budget_token);
	}
}
/*
 * blk_mq_commit_rqs will notify driver using bd->last that there is no
 * more requests. (See comment in struct blk_mq_ops for commit_rqs for
 * details)
 * Attention, we should explicitly call this in unusual cases:
 *  1) did not queue everything initially scheduled to queue
 *  2) the last attempt to queue a request failed
 */
static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
			      bool from_schedule)
{
	if (hctx->queue->mq_ops->commit_rqs && queued) {
		trace_block_unplug(hctx->queue, queued, !from_schedule);
		hctx->queue->mq_ops->commit_rqs(hctx);
	}
}
/* * Returns true if we did some work AND can potentially do more.
*/ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool get_budget)
{ enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued;
blk_status_t ret = BLK_STS_OK; bool needs_resource = false;
if (list_empty(list)) returnfalse;
/* * Now process all the entries, sending them to the driver.
*/
queued = 0; do { struct blk_mq_queue_data bd;
ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK:
queued++; break; case BLK_STS_RESOURCE:
needs_resource = true;
fallthrough; case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list); goto out; default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie.
*/ if (!list_empty(list) || ret != BLK_STS_OK)
blk_mq_commit_rqs(hctx, queued, false);
/* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run.
*/ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
blk_mq_is_shared_tags(hctx->flags));
/* * If the caller allocated budgets, free the budgets of the * requests that have not yet been passed to the block driver.
*/ if (!get_budget)
blk_mq_release_budgets(q, list);
/* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here.
*/
smp_mb();
/* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set.
*/
needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET)
needs_resource = true; if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true); elseif (needs_resource)
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
/* First online CPU mapped to @hctx; falls back to any mapped CPU if none online. */
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(hctx->cpumask);
	return cpu;
}
/* * ->next_cpu is always calculated from hctx->cpumask, so simply use * it for speeding up the check
*/ staticbool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
{ return hctx->next_cpu >= nr_cpu_ids;
}
/* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every * BLK_MQ_CPU_WORK_BATCH queued items.
*/ staticint blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{ bool tried = false; int next_cpu = hctx->next_cpu;
/* Switch to unbound if no allowable CPUs in this hctx */ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND;
/* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD.
*/ if (!cpu_online(next_cpu)) { if (!tried) {
tried = true; goto select_cpu;
}
/* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again.
*/
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND;
}
hctx->next_cpu = next_cpu; return next_cpu;
}
/** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs.
*/ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsignedlong msecs)
{ if (unlikely(blk_mq_hctx_stopped(hctx))) return;
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
/* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced.
*/
__blk_mq_run_dispatch_ops(hctx->queue, false,
need_run = !blk_queue_quiesced(hctx->queue) &&
blk_mq_hctx_has_pending(hctx)); return need_run;
}
/** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware.
*/ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{ bool need_run;
/* * We can't run the queue inline with interrupts disabled.
*/
WARN_ON_ONCE(!async && in_interrupt());
need_run = blk_mq_hw_queue_need_run(hctx); if (!need_run) { unsignedlong flags;
/* * Synchronize with blk_mq_unquiesce_queue(), because we check * if hw queue is quiesced locklessly above, we need the use * ->queue_lock to make sure we see the up-to-date status to * not miss rerunning the hw queue.
*/
spin_lock_irqsave(&hctx->queue->queue_lock, flags);
need_run = blk_mq_hw_queue_need_run(hctx);
spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
if (!need_run) return;
}
if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
blk_mq_delay_run_hw_queue(hctx, 0); return;
}
/* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler.
*/ staticstruct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
{ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing.
*/ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL;
}
/**
 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
 * @q: Pointer to the request queue to run.
 * @async: If we want to run the queue asynchronously.
 */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx, *sq_hctx;
	unsigned long i;

	sq_hctx = NULL;
	if (blk_queue_sq_sched(q))
		sq_hctx = blk_mq_get_sq_hctx(q);
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_hctx_stopped(hctx))
			continue;
		/*
		 * Dispatch from this hctx either if there's no hctx preferred
		 * by IO scheduler or if it has requests that bypass the
		 * scheduler.
		 */
		if (!sq_hctx || sq_hctx == hctx ||
		    !list_empty_careful(&hctx->dispatch))
			blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues.
*/ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsignedlong msecs)
{ struct blk_mq_hw_ctx *hctx, *sq_hctx; unsignedlong i;
sq_hctx = NULL; if (blk_queue_sq_sched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes.
*/ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler.
*/ if (!sq_hctx || sq_hctx == hctx ||
!list_empty_careful(&hctx->dispatch))
blk_mq_delay_run_hw_queue(hctx, msecs);
}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
/* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement.
*/ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
cancel_delayed_work(&hctx->run_work);
/* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.63 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.