/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
	UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
/* Per-command private data stored in the uring_cmd payload. */
struct ublk_uring_cmd_pdu {
	/*
	 * Store requests in same batch temporarily for queuing them to
	 * daemon context.
	 *
	 * It should have been stored to request payload, but we do want
	 * to avoid extra pre-allocation, and uring_cmd payload is always
	 * free for us
	 */
	union {
		struct request *req;
		struct request *req_list;
	};

	/*
	 * The following two are valid in this cmd whole lifetime, and
	 * setup in ublk uring_cmd handler
	 */
	struct ublk_queue *ubq;

	u16 tag;
};
/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by ublk driver, and waited
 * for incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command will be completed, and owned by
 * ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE 0x01
/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
 * cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
 * get data buffer address from ublksrv.
 *
 * Then, bio data could be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 */
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
/*
 * request buffer is registered automatically, so we have to unregister it
 * before completing this request.
 *
 * io_uring will unregister buffer automatically for us during exiting.
 */
#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
/*
 * Initialize refcount to a large number to include any registered buffers.
 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
 * any buffers registered on the io daemon task.
 */
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
/*
 * NOTE(review): struct ublk_io is truncated in this chunk — its closing
 * brace is not visible here, so further members may follow elsewhere.
 * Tokens are kept exactly as found (including fused keywords).
 */
struct ublk_io { /* userspace buffer address from io cmd */ union {
__u64 addr; struct ublk_auto_buf_reg buf;
}; unsignedint flags; int res;
/* which of the two is valid depends on flags — see comments inline */
union { /* valid if UBLK_IO_FLAG_ACTIVE is set */ struct io_uring_cmd *cmd; /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */ struct request *req;
};
struct task_struct *task;
/* * The number of uses of this I/O by the ublk server * if user copy or zero copy are enabled: * - UBLK_REFCOUNT_INIT from dispatch to the server * until UBLK_IO_COMMIT_AND_FETCH_REQ * - 1 for each inflight ublk_ch_{read,write}_iter() call * - 1 for each io_uring registered buffer not registered on task * The I/O can only be completed once all references are dropped. * User copy and buffer registration operations are only permitted * if the reference count is nonzero.
 */
refcount_t ref; /* Count of buffers registered on task and not yet unregistered */ unsigned task_registered_buffers;
/*
 * NOTE(review): fragment of a zone-report helper — the enclosing function
 * begins and ends outside this chunk. Allocates a driver-internal request,
 * fills a UBLK_IO_OP_REPORT_ZONES descriptor, maps the kernel buffer,
 * executes the request and then walks the returned zones.
 */
req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) {
ret = PTR_ERR(req); goto out;
}
desc.operation = UBLK_IO_OP_REPORT_ZONES;
desc.sector = sector;
desc.nr_zones = zones_in_request;
ret = ublk_zoned_insert_report_desc(req, &desc); if (ret) goto free_req;
ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL); if (ret) goto erase_desc;
status = blk_execute_rq(req, 0);
ret = blk_status_to_errno(status);
/* goto-based unwind: erase descriptor, then free the request */
erase_desc:
ublk_zoned_erase_report_desc(req);
free_req:
blk_mq_free_request(req); if (ret) goto out;
for (unsignedint i = 0; i < zones_in_request; i++) { struct blk_zone *zone = buffer + i;
/* A zero length zone means no more zones in this response */ if (!zone->len) break;
/* Device index allocation and global control-path serialization. */
static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

static DEFINE_MUTEX(ublk_ctl_mutex);

#define UBLK_MAX_UBLKS UBLK_MINORS

/*
 * Max unprivileged ublk devices allowed to add
 *
 * It can be extended to one per-user limit in future or even controlled
 * by cgroup.
 */
static unsigned int unprivileged_ublks_max = 64;
static unsigned int unprivileged_ublks_added;	/* protected by ublk_ctl_mutex */
/*
 * NOTE(review): tail fragment of a parameter-apply function — the head is
 * outside this chunk. Presumably `p` is a basic-params struct; confirm
 * against the enclosing function.
 */
if (p->attrs & UBLK_ATTR_READ_ONLY)
set_disk_ro(ub->ub_disk, true);
set_capacity(ub->ub_disk, p->dev_sectors);
}
/*
 * Validate the parameter set staged in ub->params before applying it.
 *
 * NOTE(review): truncated — the function's final return and closing brace
 * are not visible in this chunk. Tokens kept exactly as found.
 */
staticint ublk_validate_params(conststruct ublk_device *ub)
{ /* basic param is the only one which must be set */ if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { conststruct ublk_param_basic *p = &ub->params.basic;
/* logical block size must be 512..PAGE_SIZE and <= physical size */
if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) return -EINVAL;
if (p->logical_bs_shift > p->physical_bs_shift) return -EINVAL;
if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL;
if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) return -EINVAL;
} else return -EINVAL;
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { conststruct ublk_param_discard *p = &ub->params.discard;
/* So far, only support single segment discard */ if (p->max_discard_sectors && p->max_discard_segments != 1) return -EINVAL;
if (!p->discard_granularity) return -EINVAL;
}
/* dev_t is read-only */ if (ub->params.types & UBLK_PARAM_TYPE_DEVT) return -EINVAL;
if (ub->params.types & UBLK_PARAM_TYPE_ZONED) return ublk_dev_param_zoned_validate(ub); elseif (ublk_dev_is_zoned(ub)) return -EINVAL;
if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) { conststruct ublk_param_dma_align *p = &ub->params.dma;
/* alignment is a mask, must be a power of two minus one below PAGE_SIZE */
if (p->alignment >= PAGE_SIZE) return -EINVAL;
if (!is_power_of_2(p->alignment + 1)) return -EINVAL;
}
if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { conststruct ublk_param_segment *p = &ub->params.seg;
if (!is_power_of_2(p->seg_boundary_mask + 1)) return -EINVAL;
if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE) return -EINVAL; if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE) return -EINVAL;
}
staticinlinebool ublk_need_req_ref(conststruct ublk_queue *ubq)
{ /* * read()/write() is involved in user copy, so request reference * has to be grabbed * * for zero copy, request buffer need to be registered to io_uring * buffer table, so reference is needed * * For auto buffer register, ublk server still may issue * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up, * so reference is required too.
*/ return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
ublk_support_auto_buf_reg(ubq);
}
/*
 * Should I/O outstanding to the ublk server when it exits be reissued?
 * If not, outstanding I/O will get errors.
 */ staticinlinebool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
{ return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
(ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
}
/* * Should I/O issued while there is no ublk server queue? If not, I/O * issued while there is no ublk server will get errors.
*/ staticinlinebool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
{ return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
!(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
}
/* * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy * of the device flags for smaller cache footprint - better for fast * paths.
*/ staticinlinebool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
{ return (ubq->flags & UBLK_F_USER_RECOVERY) &&
!(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
}
/* * Should ublk devices be stopped (i.e. no recovery possible) when the * ublk server exits? If not, devices can be used again by a future * incarnation of a ublk server via the start_recovery/end_recovery * commands.
*/ staticinlinebool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
{ return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
}
/*
 * NOTE(review): two unrelated fragments follow — an unprivileged-device
 * open check whose enclosing function starts outside this chunk, and
 * ublk_copy_user_pages() whose final `return done;` / closing brace are
 * also outside this chunk.
 */
/* * If it is one unprivileged device, only owner can open * the disk. Otherwise it could be one trap made by one * evil user who grants this disk's privileges to other * users deliberately. * * This way is reasonable too given anyone can create * unprivileged device, and no need other's grant.
 */ if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { unsignedint curr_uid, curr_gid;
/* * Copy data between request pages and io_iter, and 'offset' * is the start point of linear offset of request.
 */ static size_t ublk_copy_user_pages(conststruct request *req, unsigned offset, struct iov_iter *uiter, int dir)
{ struct ublk_io_iter iter;
size_t done = 0;
if (!ublk_advance_io_iter(req, &iter, offset)) return 0;
while (iov_iter_count(uiter) && iter.bio) { unsigned nr_pages;
ssize_t len;
size_t off; int i;
/* pin up to UBLK_MAX_PIN_PAGES user pages for this round of copying */
len = iov_iter_get_pages2(uiter, iter.pages,
iov_iter_count(uiter),
UBLK_MAX_PIN_PAGES, &off); if (len <= 0) return done;
ublk_copy_io_pages(&iter, len, off, dir);
/* dirty pages we wrote into before unpinning them */
nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE); for (i = 0; i < nr_pages; i++) { if (dir == ITER_DEST)
set_page_dirty(iter.pages[i]);
put_page(iter.pages[i]);
}
done += len;
}
/*
 * NOTE(review): fragments of the request map/complete path — the enclosing
 * functions begin outside this chunk, and __ublk_prep_compl_io_cmd below
 * appears garbled (its return and ublk_complete_io_cmd's head are missing).
 */
/* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current * context is pretty fast, see ublk_pin_user_pages
 */ if (ublk_need_map_req(req)) { struct iov_iter iter; constint dir = ITER_DEST;
/* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ)
io->res = -EIO;
if (io->res < 0) {
res = errno_to_blk_status(io->res); gotoexit;
}
/* * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them * directly. * * Both the two needn't unmap.
 */ if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
req_op(req) != REQ_OP_DRV_IN) gotoexit;
/* for READ request, writing data in iod->addr to rq buffers */
unmapped_bytes = ublk_unmap_io(ubq, req, io);
/* * Extremely impossible since we got data filled in just before * * Re-read simply for this unlikely case.
 */ if (unlikely(unmapped_bytes < io->res))
io->res = unmapped_bytes;
if (blk_update_request(req, BLK_STS_OK, io->res))
blk_mq_requeue_request(req, true); elseif (likely(!blk_should_fake_timeout(req->q)))
__blk_mq_end_request(req, BLK_STS_OK);
return; exit:
blk_mq_end_request(req, res);
}
staticstruct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, struct request *req)
{ /* read cmd first because req will overwrite it */ struct io_uring_cmd *cmd = io->cmd;
/* mark this cmd owned by ublksrv */
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
/* * clear ACTIVE since we are done with this sqe/cmd slot * We can only accept io cmd in case of being not active.
 */
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
/* tell ublksrv one io request is coming */
io_uring_cmd_done(cmd, res, 0, issue_flags);
}
#define UBLK_REQUEUE_DELAY_MS 3

/*
 * We cannot process this rq: either requeue it (recovery will queue it
 * again later) or fail it right away.
 */
static inline void __ublk_abort_rq(struct ublk_queue *ubq, struct request *rq)
{
	if (!ublk_nosrv_dev_should_queue_io(ubq->dev))
		blk_mq_end_request(rq, BLK_STS_IOERR);
	else
		blk_mq_requeue_request(rq, false);
}
/*
 * NOTE(review): fragments of the dispatch and queue_rq paths — enclosing
 * functions begin and end outside this chunk.
 */
/* partially mapped, update io descriptor */ if (unlikely(mapped_bytes != blk_rq_bytes(req))) { /* * Nothing mapped, retry until we succeed. * * We may never succeed in mapping any bytes here because * of OOM. TODO: reserve one buffer with single page pinned * for providing forward progress guarantee.
 */ if (unlikely(!mapped_bytes)) {
blk_mq_requeue_request(req, false);
blk_mq_delay_kick_requeue_list(req->q,
UBLK_REQUEUE_DELAY_MS); returnfalse;
}
/* * Task is exiting if either: * * (1) current != io->task. * io_uring_cmd_complete_in_task() tries to run task_work * in a workqueue if cmd's task is PF_EXITING. * * (2) current->flags & PF_EXITING.
 */ if (unlikely(current != io->task || current->flags & PF_EXITING)) {
__ublk_abort_rq(ubq, req); return;
}
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { /* * We have not handled UBLK_IO_NEED_GET_DATA command yet, * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv * and notify it.
 */
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
__func__, ubq->q_id, req->tag, io->flags);
ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
issue_flags); return;
}
if (!ublk_start_io(ubq, req, io)) return;
if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
}
if (unlikely(READ_ONCE(ubq->fail_io))) return BLK_STS_TARGET;
/* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). We have to * abort all requeued and new rqs here to let del_gendisk() * move on. Besides, we cannot not call io_uring_cmd_complete_in_task() * to avoid UAF on io_uring ctx. * * Note: force_abort is guaranteed to be seen because it is set * before request queue is unqiuesced.
 */ if (ublk_nosrv_should_queue_io(ubq) &&
unlikely(READ_ONCE(ubq->force_abort))) return BLK_STS_IOERR;
if (check_cancel && unlikely(ubq->canceling)) return BLK_STS_IOERR;
/* fill iod to slot in io cmd buffer */
res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) return BLK_STS_IOERR;
res = ublk_prep_req(ubq, rq, false); if (res != BLK_STS_OK) return res;
/* * ->canceling has to be handled after ->force_abort and ->fail_io * is dealt with, otherwise this request may not be failed in case * of recovery, and cause hang when deleting disk
 */ if (unlikely(ubq->canceling)) {
__ublk_abort_rq(ubq, rq); return BLK_STS_OK;
}
/*
 * Reset per-queue io slots after all old uring_cmds are done.
 *
 * NOTE(review): truncated — the loop/function closing braces are not
 * visible here; the chunk then jumps into the tail of ublk_get_disk().
 */
staticvoid ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{ int i;
/* All old ioucmds have to be completed */
ubq->nr_io_ready = 0;
for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i];
/* * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch * io->cmd
 */
io->flags &= UBLK_IO_FLAG_CANCELED;
io->cmd = NULL;
io->addr = 0;
/* * old task is PF_EXITING, put it now * * It could be NULL in case of closing one quiesced * device.
 */ if (io->task) {
put_task_struct(io->task);
io->task = NULL;
}
/* NOTE(review): tail of ublk_get_disk() — grab a device ref under ub->lock */
spin_lock(&ub->lock);
disk = ub->ub_disk; if (disk)
get_device(disk_to_dev(disk));
spin_unlock(&ub->lock);
return disk;
}
/* Drop the device reference taken by ublk_get_disk(); NULL is a no-op. */
static void ublk_put_disk(struct gendisk *disk)
{
	if (!disk)
		return;
	put_device(disk_to_dev(disk));
}
/*
 * Use this function to ensure that ->canceling is consistently set for
 * the device and all queues. Do not set these flags directly.
 *
 * Caller must ensure that:
 * - cancel_mutex is held. This ensures that there is no concurrent
 *   access to ub->canceling and no concurrent writes to ubq->canceling.
 * - there are no concurrent reads of ubq->canceling from the queue_rq
 *   path. This can be done by quiescing the queue, or through other
 *   means.
 */
static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
	__must_hold(&ub->cancel_mutex)
{
	int qid;

	ub->canceling = canceling;
	for (qid = 0; qid < ub->dev_info.nr_hw_queues; qid++) {
		struct ublk_queue *ubq = ublk_get_queue(ub, qid);

		ubq->canceling = canceling;
	}
}
/*
 * NOTE(review): head of ublk_check_and_reset_active_ref() followed by a
 * fragment of the char-device release path; both are cut mid-body by the
 * chunk boundaries.
 */
staticbool ublk_check_and_reset_active_ref(struct ublk_device *ub)
{ int i, j;
if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY |
UBLK_F_AUTO_BUF_REG))) returnfalse;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { struct ublk_queue *ubq = ublk_get_queue(ub, i);
/* * For zero-copy and auto buffer register modes, I/O references * might not be dropped naturally when the daemon is killed, but * io_uring guarantees that registered bvec kernel buffers are * unregistered finally when freeing io_uring context, then the * active references are dropped. * * Wait until active references are dropped for avoiding use-after-free * * registered buffer may be unregistered in io_ring's release hander, * so have to wait by scheduling work function for avoiding the two * file release dependency.
 */ if (ublk_check_and_reset_active_ref(ub)) {
schedule_delayed_work(&ub->exit_work, 1); return;
}
/* * disk isn't attached yet, either device isn't live, or it has * been removed already, so we needn't to do anything
 */
disk = ublk_get_disk(ub); if (!disk) goto out;
/* * All uring_cmd are done now, so abort any request outstanding to * the ublk server * * This can be done in lockless way because ublk server has been * gone * * More importantly, we have to provide forward progress guarantee * without holding ub->mutex, otherwise control task grabbing * ub->mutex triggers deadlock * * All requests may be inflight, so ->canceling may not be set, set * it now.
 */
mutex_lock(&ub->cancel_mutex);
ublk_set_canceling(ub, true); for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_abort_queue(ub, ublk_get_queue(ub, i));
mutex_unlock(&ub->cancel_mutex);
blk_mq_kick_requeue_list(disk->queue);
/* * All infligh requests have been completed or requeued and any new * request will be failed or requeued via `->canceling` now, so it is * fine to grab ub->mutex now.
 */
mutex_lock(&ub->mutex);
/* double check after grabbing lock */ if (!ub->ub_disk) goto unlock;
/* * Transition the device to the nosrv state. What exactly this * means depends on the recovery flags
 */ if (ublk_nosrv_should_stop_dev(ub)) { /* * Allow any pending/future I/O to pass through quickly * with an error. This is needed because del_gendisk * waits for all pending I/O to complete
 */ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
ublk_stop_dev_unlocked(ub);
} else { if (ublk_nosrv_dev_should_queue_io(ub)) { /* ->canceling is set and all requests are aborted */
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
} else {
ub->dev_info.state = UBLK_S_DEV_FAIL_IO; for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
}
}
unlock:
mutex_unlock(&ub->mutex);
ublk_put_disk(disk);
/* all uring_cmd has been done now, reset device & ubq */
ublk_reset_ch_dev(ub);
out:
clear_bit(UB_STATE_OPEN, &ub->state);
/* put the reference grabbed in ublk_ch_release() */
ublk_put_device(ub);
}
/*
 * Called from ublk char device release handler, when any uring_cmd is
 * done, meantime request queue is "quiesced" since all inflight requests
 * can't be completed because ublk server is dead.
 *
 * So no one can hold our request IO reference any more, simply ignore the
 * reference, and complete the request immediately
 */
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int tag;

	for (tag = 0; tag < ubq->q_depth; tag++) {
		struct ublk_io *io = &ubq->ios[tag];

		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			continue;
		__ublk_fail_req(ubq, io, io->req);
	}
}
/*
 * NOTE(review): fragments of the cancel path — the enclosing functions
 * (start-cancel, per-cmd cancel, cancel_fn, tagset-idle wait) all begin
 * or end outside this chunk.
 */
mutex_lock(&ub->cancel_mutex); if (ub->canceling) goto out; /* * Now we are serialized with ublk_queue_rq() * * Make sure that ubq->canceling is set when queue is frozen, * because ublk_queue_rq() has to rely on this flag for avoiding to * touch completed uring_cmd
 */
blk_mq_quiesce_queue(disk->queue);
ublk_set_canceling(ub, true);
blk_mq_unquiesce_queue(disk->queue);
out:
mutex_unlock(&ub->cancel_mutex);
ublk_put_disk(disk);
}
/* * Don't try to cancel this command if the request is started for * avoiding race between io_uring_cmd_done() and * io_uring_cmd_complete_in_task(). * * Either the started request will be aborted via __ublk_abort_rq(), * then this uring_cmd is canceled next time, or it will be done in * task work function ublk_dispatch_req() because io_uring guarantees * that ublk_dispatch_req() is always called
 */
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); if (req && blk_mq_request_started(req) && req->tag == tag) return;
if (!done)
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
}
/* * The ublk char device won't be closed when calling cancel fn, so both * ublk device and queue are guaranteed to be live * * Two-stage cancel: * * - make every active uring_cmd done in ->cancel_fn() * * - aborting inflight ublk IO requests in ublk char device release handler, * which depends on 1st stage because device can only be closed iff all * uring_cmd are done * * Do _not_ try to acquire ub->mutex before all inflight requests are * aborted, otherwise deadlock may be caused.
 */ staticvoid ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, unsignedint issue_flags)
{ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; struct task_struct *task; struct ublk_io *io;
if (WARN_ON_ONCE(!ubq)) return;
if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth)) return;
/* NOTE(review): below is a fragment of the tagset-idle polling loop */
WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); while (true) {
idle = true;
blk_mq_tagset_busy_iter(&ub->tag_set,
ublk_check_inflight_rq, &idle); if (idle) break;
msleep(UBLK_REQUEUE_DELAY_MS);
}
}
/*
 * Mark every queue force_abort under queue quiesce so that in-flight and
 * future requests are failed instead of being dispatched to a dead server.
 */
static void ublk_force_abort_dev(struct ublk_device *ub)
{
	struct request_queue *q = ub->ub_disk->queue;
	int qid;

	pr_devel("%s: force abort ub: dev_id %d state %s\n",
		 __func__, ub->dev_info.dev_id,
		 ub->dev_info.state == UBLK_S_DEV_LIVE ? "LIVE" : "QUIESCED");

	blk_mq_quiesce_queue(q);
	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
		ublk_wait_tagset_rqs_idle(ub);
	for (qid = 0; qid < ub->dev_info.nr_hw_queues; qid++)
		ublk_get_queue(ub, qid)->force_abort = true;
	blk_mq_unquiesce_queue(q);

	/* We may have requeued some rqs in ublk_quiesce_queue() */
	blk_mq_kick_requeue_list(q);
}
/* reset ublk io_uring queue & io flags */
static void ublk_reset_io_flags(struct ublk_device *ub)
{
	int qid, tag;

	for (qid = 0; qid < ub->dev_info.nr_hw_queues; qid++) {
		struct ublk_queue *ubq = ublk_get_queue(ub, qid);

		/* UBLK_IO_FLAG_CANCELED can be cleared now */
		spin_lock(&ubq->cancel_lock);
		for (tag = 0; tag < ubq->q_depth; tag++)
			ubq->ios[tag].flags &= ~UBLK_IO_FLAG_CANCELED;
		spin_unlock(&ubq->cancel_lock);

		ubq->fail_io = false;
	}

	mutex_lock(&ub->cancel_mutex);
	ublk_set_canceling(ub, false);
	mutex_unlock(&ub->cancel_mutex);
}
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
	__must_hold(&ub->mutex)
{
	ubq->nr_io_ready++;
	if (ublk_queue_ready(ubq))
		ub->nr_queues_ready++;
	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
		ub->unprivileged_daemons = true;

	if (ub->nr_queues_ready != ub->dev_info.nr_hw_queues)
		return;

	/* now we are ready for handling ublk io request */
	ublk_reset_io_flags(ub);
	complete_all(&ub->completion);
}
/*
 * NOTE(review): fragments of the auto-buf-reg config, ublk_fill_io_cmd
 * (its `return req;` and closing brace are missing), ublk_prep_cancel's
 * tail, and the io-release / daemon buffer-register paths.
 */
/* * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same * `io_ring_ctx`. * * If this uring_cmd's io_ring_ctx isn't same with the * one for registering the buffer, it is ublk server's * responsibility for unregistering the buffer, otherwise * this ublk request gets stuck.
 */ if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
*buf_idx = io->buf.index;
}
return ublk_set_auto_buf_reg(io, cmd);
}
/* Once we return, `io->req` can't be used any more */ staticinlinestruct request *
ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
{ struct request *req = io->req;
io->cmd = cmd;
io->flags |= UBLK_IO_FLAG_ACTIVE; /* now this cmd slot is owned by ublk driver */
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
/* * Safe to refer to @ubq since ublk_queue won't be died until its * commands are completed
 */
pdu->ubq = ubq;
pdu->tag = tag;
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
/* * task_registered_buffers may be 0 if buffers were registered off task * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
 */ if (current == io->task && io->task_registered_buffers)
io->task_registered_buffers--; else
ublk_put_req_ref(io, rq);
}
/* * Ensure there are still references for ublk_sub_req_ref() to release. * If not, fall back on the thread-safe buffer registration.
 */
new_registered_buffers = io->task_registered_buffers + 1; if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) return -EINVAL;
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags); if (ret) return ret;
/* Validate the buffer address supplied with UBLK_IO_FETCH_REQ. */
static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
{
	if (!ublk_need_map_io(ubq)) {
		/* User copy requires addr to be unset */
		return buf_addr ? -EINVAL : 0;
	}

	/*
	 * FETCH_RQ has to provide IO buffer if NEED GET DATA is not
	 * enabled
	 */
	if (!buf_addr && !ublk_need_get_data(ubq))
		return -EINVAL;
	return 0;
}
/*
 * NOTE(review): large fragment of the uring_cmd handling path (fetch,
 * commit-and-fetch checks, ublk_get_data, and the main command dispatch
 * switch) — every enclosing function begins or ends outside this chunk.
 */
/* * When handling FETCH command for setting up ublk uring queue, * ub->mutex is the innermost lock, and we won't block for handling * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
 */
mutex_lock(&ub->mutex); /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) {
ret = -EBUSY; goto out;
}
/* allow each command to be FETCHed at most once */ if (io->flags & UBLK_IO_FLAG_ACTIVE) {
ret = -EINVAL; goto out;
}
if (ublk_need_map_io(ubq)) { /* * COMMIT_AND_FETCH_REQ has to provide IO buffer if * NEED GET DATA is not enabled or it is Read IO.
 */ if (!buf_addr && (!ublk_need_get_data(ubq) ||
req_op(req) == REQ_OP_READ)) return -EINVAL;
} elseif (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { /* * User copy requires addr to be unset when command is * not zone append
 */ return -EINVAL;
}
/* NOTE(review): ublk_get_data's final return is missing from this chunk */
staticbool ublk_get_data(conststruct ublk_queue *ubq, struct ublk_io *io, struct request *req)
{ /* * We have handled UBLK_IO_NEED_GET_DATA command, * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just * do the copy work.
 */
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; /* update iod->addr because ublksrv may have passed a new io buffer */
ublk_get_iod(ubq, req->tag)->addr = io->addr;
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
__func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
ub_cmd->result);
ret = ublk_check_cmd_op(cmd_op); if (ret) goto out;
/* * io_buffer_unregister_bvec() doesn't access the ubq or io, * so no need to validate the q_id, tag, or task
 */ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
issue_flags);
ret = -EINVAL; if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out;
ubq = ublk_get_queue(ub, ub_cmd->q_id);
if (tag >= ubq->q_depth) goto out;
io = &ubq->ios[tag]; /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
ret = ublk_check_fetch_buf(ubq, ub_cmd->addr); if (ret) goto out;
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); if (ret) goto out;
if (READ_ONCE(io->task) != current) { /* * ublk_register_io_buf() accesses only the io's refcount, * so can be handled on any task
 */ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
issue_flags);
goto out;
}
/* there is pending io cmd, something must be wrong */ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
ret = -EBUSY; goto out;
}
/* * ensure that the user issues UBLK_IO_NEED_GET_DATA * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
 */ if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out;
switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ:
ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr); if (ret) goto out;
io->res = ub_cmd->result;
req = ublk_fill_io_cmd(io, cmd);
ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx); compl = ublk_need_complete_req(ubq, io);
/* can't touch 'ublk_io' any more */ if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ub_cmd->zone_append_lba; if (compl)
__ublk_complete_rq(req);
if (ret) goto out; break; case UBLK_IO_NEED_GET_DATA: /* * ublk_get_data() may fail and fallback to requeue, so keep * uring_cmd active first and prepare for handling new requeued * request
 */
req = ublk_fill_io_cmd(io, cmd);
ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
WARN_ON_ONCE(ret); if (likely(ublk_get_data(ubq, io, req))) {
__ublk_prep_compl_io_cmd(io, req); return UBLK_IO_RES_OK;
} break; default: goto out;
}
ublk_prep_cancel(cmd, issue_flags, ubq, tag); return -EIOCBQUEUED;
out:
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
__func__, cmd_op, tag, ret, io->flags); return ret;
}
/* * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, * which would overwrite it with io->cmd
 */
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); if (!req) return NULL;
if (!ublk_get_req_ref(io)) return NULL;
if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) goto fail_put;
/* well-implemented server won't run into unlocked */ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb); return -EIOCBQUEUED;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.