/*
 * Request flag identifiers, enumerated consecutively starting at zero.
 *
 * Keep rqf_name[] in sync with the definitions below.
 */
enum rqf_flags {
	/* the drive may already have started this request */
	__RQF_STARTED,
	/* request belongs to a flush sequence */
	__RQF_FLUSH_SEQ,
	/* merge of different types, fail separately */
	__RQF_MIXED_MERGE,
	/* do not call prep for this request */
	__RQF_DONTPREP,
	/* use hctx->sched_tags */
	__RQF_SCHED_TAGS,
	/* use an I/O scheduler for this request */
	__RQF_USE_SCHED,
	/* vaguely specified driver internal error; ignored by block layer */
	__RQF_FAILED,
	/* do not warn about errors */
	__RQF_QUIET,
	/* account into disk and partition IO statistics */
	__RQF_IO_STAT,
	/* runtime pm request */
	__RQF_PM,
	/* on IO scheduler merge hash */
	__RQF_HASHED,
	/* track IO completion time */
	__RQF_STATS,
	/*
	 * Look at ->special_vec for the actual data payload instead of the
	 * bio chain.
	 */
	__RQF_SPECIAL_PAYLOAD,
	/* request completion needs to be signaled to zone write plugging */
	__RQF_ZONE_WRITE_PLUGGING,
	/* ->timeout has been called, don't expire again */
	__RQF_TIMED_OUT,
	__RQF_RESV,
	/* number of identifiers defined above; must remain the last entry */
	__RQF_BITS
};
/* * Try to put the fields that are referenced together in the same cacheline. * * If you modify this structure, make sure to update blk_rq_init() and * especially blk_mq_rq_ctx_init() to take care of the added fields.
*/ struct request { struct request_queue *q; struct blk_mq_ctx *mq_ctx; struct blk_mq_hw_ctx *mq_hctx;
blk_opf_t cmd_flags; /* op and common flags */
req_flags_t rq_flags;
int tag; int internal_tag;
unsignedint timeout;
/* the following two fields are internal, NEVER access directly */ unsignedint __data_len; /* total data len */
sector_t __sector; /* sector cursor */
struct bio *bio; struct bio *biotail;
union { struct list_head queuelist; struct request *rq_next;
};
struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */
u64 alloc_time_ns; #endif /* Time that this request was allocated for this IO. */
u64 start_time_ns; /* Time that I/O was submitted to the device. */
u64 io_start_time_ns;
#ifdef CONFIG_BLK_WBT unsignedshort wbt_flags; #endif /* * rq sectors used for blk stats. It has the same value * with blk_rq_sectors(rq), except that it never be zeroed * by completion.
*/ unsignedshort stats_sectors;
/* * Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed.
*/ unsignedshort nr_phys_segments; unsignedshort nr_integrity_segments;
/* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used * to queue the request for softirq completion, which is long * after the request has been unhashed (and even removed from * the dispatch list).
*/ union { struct hlist_node hash; /* merge hash */ struct llist_node ipi_list;
};
/* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. special_vec must * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be * insert into an IO scheduler.
*/ union { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec;
};
/* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it.
*/ struct { struct io_cq *icq; void *priv[2];
} elv;
/**
 * enum blk_eh_timer_return - How the timeout handler should proceed
 * @BLK_EH_DONE: The block driver completed the command or will complete it
 *	at a later time.
 * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the
 *	request to complete.
 */
enum blk_eh_timer_return {
	BLK_EH_DONE,
	BLK_EH_RESET_TIMER,
};
/** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device
*/ struct blk_mq_hw_ctx { struct { /** @lock: Protects the dispatch list. */
spinlock_t lock; /** * @dispatch: Used for requests that are ready to be * dispatched to the hardware but for some reason (e.g. lack of * resources) could not be sent to the hardware. As soon as the * driver can send new requests, requests at this list will * be sent first for a fairer dispatch.
*/ struct list_head dispatch; /** * @state: BLK_MQ_S_* flags. Defines the state of the hw * queue (active, scheduled to restart, stopped).
*/ unsignedlong state;
} ____cacheline_aligned_in_smp;
/** * @run_work: Used for scheduling a hardware queue run at a later time.
*/ struct delayed_work run_work; /** @cpumask: Map of available CPUs where this hctx can run. */
cpumask_var_t cpumask; /** * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU * selection from @cpumask.
*/ int next_cpu; /** * @next_cpu_batch: Counter of how many works left in the batch before * changing to the next CPU.
*/ int next_cpu_batch;
/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ unsignedlong flags;
/** * @sched_data: Pointer owned by the IO scheduler attached to a request * queue. It's up to the IO scheduler how to use this pointer.
*/ void *sched_data; /** * @queue: Pointer to the request queue that owns this hardware context.
*/ struct request_queue *queue; /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq;
/** * @driver_data: Pointer to data owned by the block driver that created * this hctx
*/ void *driver_data;
/** * @ctx_map: Bitmap for each software queue. If bit is on, there is a * pending request in that software queue.
*/ struct sbitmap ctx_map;
/** * @dispatch_from: Software queue to be used when no scheduler was * selected.
*/ struct blk_mq_ctx *dispatch_from; /** * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to * decide if the hw_queue is busy using Exponential Weighted Moving * Average algorithm.
*/ unsignedint dispatch_busy;
/** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsignedshort type; /** @nr_ctx: Number of software queues. */ unsignedshort nr_ctx; /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs;
/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
spinlock_t dispatch_wait_lock; /** * @dispatch_wait: Waitqueue to put requests when there is no tag * available at the moment, to wait for another try in the future.
*/
wait_queue_entry_t dispatch_wait;
/** * @wait_index: Index of next available dispatch_wait queue to insert * requests.
*/
atomic_t wait_index;
/** * @tags: Tags owned by the block driver. A tag at this set is only * assigned when a request is dispatched from a hardware queue.
*/ struct blk_mq_tags *tags; /** * @sched_tags: Tags owned by I/O scheduler. If there is an I/O * scheduler associated with a request queue, a tag is assigned when * that request is allocated. Else, this member is not used.
*/ struct blk_mq_tags *sched_tags;
/** @numa_node: NUMA node the storage adapter has been connected to. */ unsignedint numa_node; /** @queue_num: Index of this hardware queue. */ unsignedint queue_num;
/** * @nr_active: Number of active requests. Only used when a tag set is * shared across request queues.
*/
atomic_t nr_active;
/** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ struct kobject kobj;
#ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named * as cpu<cpu_number>.
*/ struct dentry *debugfs_dir; /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif
/** * @hctx_list: if this hctx is not in use, this is an entry in * q->unused_hctx_list.
*/ struct list_head hctx_list;
};
/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};
/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};
/** * struct blk_mq_tag_set - tag set that can be shared between request queues * @ops: Pointers to functions that implement block driver behavior. * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the * same size, and it's perfectly legal to share maps between * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag * allocations. * @cmd_size: Number of additional bytes to allocate per request. The block * driver owns these additional bytes. * @numa_node: NUMA node the storage adapter has been connected to. * @timeout: Request processing timeout in jiffies. * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @shared_tags: * Shared set of tags. Has @nr_hw_queues elements. If set, * shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). * @update_nr_hwq_lock: * Synchronize updating nr_hw_queues with add/del disk & * switching elevator.
*/ struct blk_mq_tag_set { conststruct blk_mq_ops *ops; struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsignedint nr_maps; unsignedint nr_hw_queues; unsignedint queue_depth; unsignedint reserved_tags; unsignedint cmd_size; int numa_node; unsignedint timeout; unsignedint flags; void *driver_data;
/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};
/**
 * struct blk_mq_ops - Callback functions that implements block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @queue_rqs: Queue a list of new requests. Driver is guaranteed
	 * that each request belongs to the same queue. If the driver doesn't
	 * empty the @rqlist completely, then the rest will be queued
	 * individually by the block layer upon return.
	 */
	void (*queue_rqs)(struct rq_list *rqlist);

	/**
	 * @get_budget: Reserve budget before queue request, once .queue_rq is
	 * run, it is driver's responsibility to release the
	 * reserved budget. Also we have to handle failure case
	 * of .get_budget for avoiding I/O deadlock.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tag greater than or equal to queue_depth is for setting up
	 * flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, and usually for freeing the driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers specify their own queue mapping by
	 * overriding the setup-time function that builds the mq_map.
	 */
	void (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
/* Keep hctx_flag_name[] in sync with the definitions below */ enum {
BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO:
*/
BLK_MQ_F_STACKING = 1 << 2,
BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
BLK_MQ_F_BLOCKING = 1 << 4,
/* * Alloc tags on a round-robin base instead of the first available one.
*/
BLK_MQ_F_TAG_RR = 1 << 5,
/* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'.
*/
BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6,
/* Keep hctx_state_name[] in sync with the definitions below */
enum {
	BLK_MQ_S_STOPPED,
	BLK_MQ_S_TAG_ACTIVE,
	BLK_MQ_S_SCHED_RESTART,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE,

	BLK_MQ_S_MAX
};
/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 *
 * Stores MQ_RQ_COMPLETE into rq->state via WRITE_ONCE().
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}
/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful in preemptible context instead of an
 * interrupt.
 *
 * @rq:       request to complete; its state is set to MQ_RQ_COMPLETE first.
 * @complete: completion handler, invoked with @rq right after the state
 *            update.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
		void (*complete)(struct request *rq))
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	complete(rq);
}
/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 *
 * Returns true when at least one of RQF_IO_STAT, RQF_STATS or RQF_USE_SCHED
 * is set in rq->rq_flags.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}
/** * blk_mq_add_to_batch() - add a request to the completion batch * @req: The request to add to batch * @iob: The batch to add the request * @is_error: Specify true if the request failed with an error * @complete: The completaion handler for the request * * Batched completions only work when there is no I/O error and no special * ->end_io handler. * * Return: true when the request was added to the batch, otherwise false
*/ staticinlinebool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, bool is_error, void (*complete)(struct io_comp_batch *))
{ /* * Check various conditions that exclude batch processing: * 1) No batch container * 2) Has scheduler data attached * 3) Not a passthrough request and end_io set * 4) Not a passthrough request and failed with an error
*/ if (!iob) returnfalse; if (req->rq_flags & RQF_SCHED_TAGS) returnfalse; if (!blk_rq_is_passthrough(req)) { if (req->end_io) returnfalse; if (is_error) returnfalse;
}
/** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted * * Return: request * * Driver command data is immediately after the request. So subtract request * size to get back to the original request.
*/ staticinlinestruct request *blk_mq_rq_from_pdu(void *pdu)
{ return pdu - sizeof(struct request);
}
/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be casted
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So step one
 * struct request past @rq to get the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
/* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats
*/ staticinline sector_t blk_rq_pos(conststruct request *rq)
{ return rq->__sector;
}
/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request.  Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 *
 * Returns rq->special_vec.bv_len when RQF_SPECIAL_PAYLOAD is set in
 * rq->rq_flags, otherwise blk_rq_bytes(rq).
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}
/* * Return the first full biovec in the request. The caller needs to check that * there are any bvecs before calling this helper.
*/ staticinlinestruct bio_vec req_bvec(struct request *rq)
{ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec; return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}
/* * Request completion related functions. * * blk_update_request() completes given number of bytes and updates * the request without completing it.
*/ bool blk_update_request(struct request *rq, blk_status_t error, unsignedint nr_bytes); void blk_abort_request(struct request *);
/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter.  But for data-less command like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload.  In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}
/* * Number of discard segments (or ranges) the driver needs to fill in. * Each discard bio merged into a request is counted as one segment.
*/ staticinlineunsignedshort blk_rq_nr_discard_segments(struct request *rq)
{ return max_t(unsignedshort, rq->nr_phys_segments, 1);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.