/*
 * Completion handlers for regular and IOPOLL-mode requests; definitions
 * appear later in the file. (Extraction artifact fixed: "staticvoid" is not
 * a valid token sequence, and both prototypes were fused onto one line.)
 */
static void io_complete_rw(struct kiocb *kiocb, long res);
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res);
/*
 * Per-request read/write state, stored in the request's command area and
 * retrieved via io_kiocb_to_cmd(req, struct io_rw).
 */
struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;	/* user buffer address; advanced in loop_rw_iter() for bvec iters */
	u32				len;	/* remaining length for addr; 0 is also used to mean "unclamped" for mshot */
	rwf_t				flags;	/* RWF_* flags from the SQE, applied via kiocb_set_rw_flags() */
};
staticbool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask)
{ /* If FMODE_NOWAIT is set for a file, we're golden */ if (req->flags & REQ_F_SUPPORT_NOWAIT) returntrue; /* No FMODE_NOWAIT, if we can poll, check the status */ if (io_file_can_poll(req)) { struct poll_table_struct pt = { ._key = mask };
return vfs_poll(req->file, &pt) & mask;
} /* No FMODE_NOWAIT support, and file isn't pollable. Tough luck. */ returnfalse;
}
staticvoid io_req_rw_cleanup(struct io_kiocb *req, unsignedint issue_flags)
{ /* * Disable quick recycling for anything that's gone through io-wq. * In theory, this should be fine to cleanup. However, some read or * write iter handling touches the iovec AFTER having called into the * handler, eg to reexpand or revert. This means we can have: * * task io-wq * issue * punt to io-wq * issue * blkdev_write_iter() * ->ki_complete() * io_complete_rw() * queue tw complete * run tw * req_rw_cleanup * iov_iter_count() <- look at iov_iter again * * which can lead to a UAF. This is only possible for io-wq offload * as the cleanup can run in parallel. As io-wq is not the fast path, * just leave cleanup to the end. * * This is really a bug in the core code that does this, any issue * path should assume that a successful (or -EIOCBQUEUED) return can * mean that the underlying data can be gone at any time. But that * should be fixed seperately, and then this check could be killed.
*/ if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) {
req->flags &= ~REQ_F_NEED_CLEANUP;
io_rw_recycle(req, issue_flags);
}
}
if (io_rw_alloc_async(req)) return -ENOMEM;
io = req->async_data;
rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
io->buf_group = req->buf_index;
ioprio = READ_ONCE(sqe->ioprio); if (ioprio) {
ret = ioprio_check_cap(ioprio); if (ret) return ret;
/*
 * Common prep for readv/writev style requests: do the normal rw prep, and
 * for provided-buffer requests additionally validate the single-iovec
 * buffer-select setup.
 */
static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		       int ddir)
{
	int ret;

	ret = io_prep_rw(req, sqe, ddir);
	if (unlikely(ret))
		return ret;
	/* nothing further to validate unless buffer selection is in use */
	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	/*
	 * Have to do this validation here, as this is in io_read() rw->len
	 * might have changed due to buffer selection
	 */
	return io_iov_buffer_select_prep(req);
}
int io_prep_readv_fixed(struct io_kiocb *req, conststruct io_uring_sqe *sqe)
{ int ret;
ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; return io_rw_prep_reg_vec(req);
}
int io_prep_writev_fixed(struct io_kiocb *req, conststruct io_uring_sqe *sqe)
{ int ret;
ret = __io_prep_rw(req, sqe, ITER_SOURCE); if (unlikely(ret)) return ret; return io_rw_prep_reg_vec(req);
}
/* * Multishot read is prepared just like a normal read/write request, only * difference is that we set the MULTISHOT flag.
*/ int io_read_mshot_prep(struct io_kiocb *req, conststruct io_uring_sqe *sqe)
{ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); int ret;
/* must be used with provided buffers */ if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL;
ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret;
if (!S_ISBLK(mode) && !S_ISREG(mode)) returnfalse; if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
!(ctx->flags & IORING_SETUP_IOPOLL))) returnfalse; /* * If ref is dying, we might be running poll reap from the exit work. * Don't attempt to reissue from that path, just let it fail with * -EAGAIN.
*/ if (percpu_ref_is_dying(&ctx->refs)) returnfalse;
/* * Trigger the notifications after having done some IO, and finish the write * accounting, if any.
*/ staticvoid io_req_io_end(struct io_kiocb *req)
{ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
/* add previously done IO, if any */ if (req_has_async_data(req) && io->bytes_done > 0) { if (res < 0)
res = io->bytes_done; else
res += io->bytes_done;
} return res;
}
/* IO was queued async, completion will happen later */ if (ret == -EIOCBQUEUED) return;
/* transform internal restart error codes */ if (unlikely(ret < 0)) { switch (ret) { case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: case -ERESTART_RESTARTBLOCK: /* * We can't just restart the syscall, since previously * submitted sqes may already be in progress. Just fail * this IO with EINTR.
*/
ret = -EINTR; break;
}
}
if (req->ctx->flags & IORING_SETUP_IOPOLL)
io_complete_rw_iopoll(&rw->kiocb, ret); else
io_complete_rw(&rw->kiocb, ret);
}
/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 *
 * Returns the total number of bytes transferred, or a negative error if
 * nothing was transferred before the first failure. A short segment
 * transfer terminates the loop with whatever has been done so far.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct io_kiocb *req = cmd_to_io_kiocb(rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;

	/* ->read()/->write() take __user pointers, kernel buffers can't work */
	if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
		return -EFAULT;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		void __user *addr;
		size_t len;
		ssize_t nr;

		/*
		 * Pick the next user address/length from the iter. For bvec
		 * iters there is no user address in the iter itself, so fall
		 * back to the addr/len tracked in struct io_rw.
		 */
		if (iter_is_ubuf(iter)) {
			addr = iter->ubuf + iter->iov_offset;
			len = iov_iter_count(iter);
		} else if (!iov_iter_is_bvec(iter)) {
			addr = iter_iov_addr(iter);
			len = iter_iov_len(iter);
		} else {
			addr = u64_to_user_ptr(rw->addr);
			len = rw->len;
		}

		if (ddir == READ)
			nr = file->f_op->read(file, addr, len, ppos);
		else
			nr = file->f_op->write(file, addr, len, ppos);

		if (nr < 0) {
			/* only surface the error if nothing transferred yet */
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			/* bvec progress is tracked in rw->addr/rw->len instead */
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		/* short transfer on this segment: stop and report progress */
		if (nr != len)
			break;
	}

	return ret;
}
/* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. * This gets called when the page is unlocked, and we generally expect that to * happen when the page IO is completed and the page is now uptodate. This will * queue a task_work based retry of the operation, attempting to copy the data * again. If the latter fails because the page was NOT uptodate, then we will * do a thread based blocking retry of the operation. That's the unexpected * slow path.
*/ staticint io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, int sync, void *arg)
{ struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct wait_page_key *key = arg;
/* * This controls whether a given IO request should be armed for async page * based retry. If we return false here, the request is handed to the async * worker threads for retry. If we're doing buffered reads on a regular file, * we prepare a private wait_page_queue entry and retry the operation. This * will either succeed because the page is now uptodate and unlocked, or it * will register a callback when the page is unlocked at IO completion. Through * that callback, io_uring uses task_work to setup a retry of the operation. * That retry will attempt the buffered read again. The retry will generally * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry.
*/ staticbool io_rw_should_retry(struct io_kiocb *req)
{ struct io_async_rw *io = req->async_data; struct wait_page_queue *wait = &io->wpq; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb;
/* * Never retry for NOWAIT or a request with metadata, we just complete * with -EAGAIN.
*/ if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) returnfalse;
/* Only for buffered IO */ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) returnfalse;
/* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks
*/ if (io_file_can_poll(req) ||
!(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC)) returnfalse;
if (unlikely(!(file->f_mode & mode))) return -EBADF;
if (!(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(file);
kiocb->ki_flags = file->f_iocb_flags;
ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret;
kiocb->ki_flags |= IOCB_ALLOC_CACHE;
/* * If the file is marked O_NONBLOCK, still allow retry for it if it * supports async. Otherwise it's impossible to use O_NONBLOCK files * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
*/ if (kiocb->ki_flags & IOCB_NOWAIT ||
((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
req->flags |= REQ_F_NOWAIT;
if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP;
kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI;
req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { /* make sure every req only blocks once*/
req->flags &= ~REQ_F_IOPOLL_STATE;
req->iopoll_start = ktime_get_ns();
}
} else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL;
}
if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data;
if (!(file->f_mode & FMODE_HAS_METADATA)) return -EINVAL;
/* * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here.
*/ if (!(req->file->f_flags & O_DIRECT)) return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HAS_METADATA;
kiocb->private = &io->meta;
}
if (req->flags & REQ_F_IMPORT_BUFFER) {
ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags); if (unlikely(ret)) return ret;
} elseif (io_do_buffer_select(req)) {
ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret;
}
ret = io_rw_init_file(req, FMODE_READ, READ); if (unlikely(ret)) return ret;
req->cqe.res = iov_iter_count(&io->iter);
if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req, EPOLLIN))) return -EAGAIN;
kiocb->ki_flags |= IOCB_NOWAIT;
} else { /* Ensure we clear previously set non-block flag */
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
ppos = io_kiocb_update_pos(req);
ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); if (unlikely(ret)) return ret;
ret = io_iter_do_read(rw, &io->iter);
/* * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT * issue, even though they should be returning -EAGAIN. To be safe, * retry from blocking context for either.
*/ if (ret == -EOPNOTSUPP && force_nonblock)
ret = -EAGAIN;
if (ret == -EAGAIN) { /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) goto done;
ret = 0;
} elseif (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE;
} elseif (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
(issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ goto done;
}
/* * Don't depend on the iter state matching what was consumed, or being * untouched in case of error. Restore it and we'll advance it * manually if we need to.
*/
iov_iter_restore(&io->iter, &io->iter_state);
io_meta_restore(io, kiocb);
do { /* * We end up here because of a partial read, either from * above or inside this loop. Advance the iter by the bytes * that were consumed.
*/
iov_iter_advance(&io->iter, ret); if (!iov_iter_count(&io->iter)) break;
io->bytes_done += ret;
iov_iter_save_state(&io->iter, &io->iter_state);
/* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN;
}
req->cqe.res = iov_iter_count(&io->iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset.
*/
ret = io_iter_do_read(rw, &io->iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
iov_iter_restore(&io->iter, &io->iter_state);
} while (ret > 0);
done: /* it's faster to check here then delegate to kfree */ return ret;
}
/*
 * Issue a read request: run the core read path, and on success (>= 0
 * bytes) finish the request via kiocb_done(). Errors are returned to the
 * caller unchanged.
 */
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	ret = __io_read(req, issue_flags);
	if (ret < 0)
		return ret;

	return kiocb_done(req, ret, issue_flags);
}
/*
 * Issue a multishot read. Each successful read posts a CQE with
 * IORING_CQE_F_MORE and keeps the request armed; errors, CQE overflow,
 * or loss of the multishot flag terminate the request.
 */
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned int cflags = 0;
	int ret;

	/*
	 * Multishot MUST be used on a pollable file
	 */
	if (!io_file_can_poll(req))
		return -EBADFD;

	/* make it sync, multishot doesn't support async execution */
	rw->kiocb.ki_complete = NULL;
	ret = __io_read(req, issue_flags);

	/*
	 * If we get -EAGAIN, recycle our buffer and just let normal poll
	 * handling arm it.
	 */
	if (ret == -EAGAIN) {
		/*
		 * Reset rw->len to 0 again to avoid clamping future mshot
		 * reads, in case the buffer size varies.
		 */
		if (io_kbuf_recycle(req, issue_flags))
			rw->len = 0;
		return IOU_RETRY;
	} else if (ret <= 0) {
		/* zero-byte read or hard error: terminate the multishot */
		io_kbuf_recycle(req, issue_flags);
		if (ret < 0)
			req_set_fail(req);
	} else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		/* multishot no longer armed: complete as a one-shot read */
		cflags = io_put_kbuf(req, ret, issue_flags);
	} else {
		/*
		 * Any successful return value will keep the multishot read
		 * armed, if it's still set. Put our buffer and post a CQE. If
		 * we fail to post a CQE, or multishot is no longer set, then
		 * jump to the termination path. This request is then done.
		 */
		cflags = io_put_kbuf(req, ret, issue_flags);
		rw->len = 0; /* similarly to above, reset len to 0 */

		if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				/*
				 * Force retry, as we might have more data to
				 * be read and otherwise it won't get retried
				 * until (if ever) another poll is triggered.
				 */
				io_poll_multishot_retry(req);

			return IOU_RETRY;
		}
	}

	/*
	 * Either an error, or we've hit overflow posting the CQE. For any
	 * multishot request, hitting overflow will terminate it.
	 */
	io_req_set_res(req, ret, cflags);
	io_req_rw_cleanup(req, issue_flags);
	return IOU_COMPLETE;
}
if (req->flags & REQ_F_IMPORT_BUFFER) {
ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags); if (unlikely(ret)) return ret;
}
ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret;
req->cqe.res = iov_iter_count(&io->iter);
if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req, EPOLLOUT))) goto ret_eagain;
/* Check if we can support NOWAIT. */ if (!(kiocb->ki_flags & IOCB_DIRECT) &&
!(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
(req->flags & REQ_F_ISREG)) goto ret_eagain;
kiocb->ki_flags |= IOCB_NOWAIT;
} else { /* Ensure we clear previously set non-block flag */
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
ppos = io_kiocb_update_pos(req);
ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); if (unlikely(ret)) return ret;
if (unlikely(!io_kiocb_start_write(req, kiocb))) return -EAGAIN;
kiocb->ki_flags |= IOCB_WRITE;
/* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT.
*/ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
ret2 = -EAGAIN; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto ret_eagain;
/* This is a partial write. The file pos has already been * updated, setup the async struct to complete the request * in the worker. Also update bytes_done to account for * the bytes already written.
*/
iov_iter_save_state(&io->iter, &io->iter_state);
io->bytes_done += ret2;
/* * Use minimum sleep time if we're polling devices with different * latencies. We could get more completions from the faster ones.
*/ if (ctx->hybrid_poll_time > runtime)
ctx->hybrid_poll_time = runtime;
/* * Only spin for completions if we don't have multiple devices hanging * off our complete list.
*/ if (ctx->poll_multi_queue || force_nonspin)
poll_flags |= BLK_POLL_ONESHOT;
/* * Move completed and retryable entries to our local lists. * If we find a request that requires polling, break out * and complete those lists first, if we have entries there.
*/ if (READ_ONCE(req->iopoll_completed)) break;
if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
ret = io_uring_hybrid_poll(req, &iob, poll_flags); else
ret = io_uring_classic_poll(req, &iob, poll_flags);
/* order with io_complete_rw_iopoll(), e.g. ->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) break;
nr_events++;
req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); if (req->opcode != IORING_OP_URING_CMD)
io_req_rw_cleanup(req, 0);
} if (unlikely(!nr_events)) return 0;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder die Vollständigkeit noch die Richtigkeit
noch die Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.