if (ctx->mm_account)
atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
int io_account_mem(struct io_ring_ctx *ctx, unsignedlong nr_pages)
{ int ret;
if (ctx->user) {
ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret;
}
if (ctx->mm_account)
atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
return 0;
}
/*
 * Validate a user-supplied buffer range [uaddr, uaddr + ulen).
 *
 * Rejects zero-length ranges and ranges larger than 1GB with -EFAULT,
 * and ranges whose page-aligned length would wrap around the address
 * space with -EOVERFLOW. Returns 0 if the range is acceptable.
 */
int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long base = (unsigned long) uaddr;
	unsigned long acct_len = (unsigned long) PAGE_ALIGN(ulen);
	unsigned long end;

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &end))
		return -EOVERFLOW;
	return 0;
}
staticint io_buffer_validate(struct iovec *iov)
{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong.
*/ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0;
if (ctx->file_table.data.nr) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM;
for (i = 0; i < nr_args; i++) { struct io_rsrc_node *node;
u64 tag = 0;
ret = -EFAULT; if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) goto fail; if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) goto fail; /* allow sparse sets */ if (!fds || fd == -1) {
ret = -EINVAL; if (tag) goto fail; continue;
}
file = fget(fd);
ret = -EBADF; if (unlikely(!file)) goto fail;
/* * Don't allow io_uring instances to be registered.
*/ if (io_is_uring_fops(file)) {
fput(file); goto fail;
}
ret = -ENOMEM;
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) {
fput(file); goto fail;
} if (tag)
node->tag = tag;
ctx->file_table.data.nodes[i] = node;
io_fixed_file_set(node, file);
io_file_bitmap_set(&ctx->file_table, i);
}
/* default it to the whole table */
io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0;
fail:
io_clear_table_tags(&ctx->file_table.data);
io_sqe_files_unregister(ctx); return ret;
}
/*
 * Drop all registered buffers from the ring.
 *
 * Returns -ENXIO when no buffer table is registered, otherwise frees
 * the table and returns 0.
 */
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (ctx->buf_table.nr == 0)
		return -ENXIO;

	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}
/* * Not super efficient, but this is just a registration time. And we do cache * the last compound head, so generally we'll only do a full search if we don't * match that one. * * We check if the given compound head page has already been accounted, to * avoid double accounting it. This allows us to account the full size of the * page, not just the constituent pages of a huge page.
*/ staticbool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct page *hpage)
{ int i, j;
/* check current page array */ for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) continue; if (compound_head(pages[i]) == hpage) returntrue;
}
/* check previously registered pages */ for (i = 0; i < ctx->buf_table.nr; i++) { struct io_rsrc_node *node = ctx->buf_table.nodes[i]; struct io_mapped_ubuf *imu;
if (!node) continue;
imu = node->buf; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; if (compound_head(imu->bvec[j].bv_page) == hpage) returntrue;
}
}
returnfalse;
}
/*
 * Compute and charge the memory accounted for a pinned buffer.
 *
 * Regular pages count as one page each. For compound (huge) pages the
 * whole compound size is charged once: the last-seen head is cached in
 * *last_hpage, and headpage_already_acct() skips heads already counted
 * either earlier in @pages or by a previously registered buffer.
 *
 * On accounting failure imu->acct_pages is reset to 0 and the error from
 * io_account_mem() is returned; returns 0 on success.
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			/* charge the entire compound page in one go */
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
/* Store head pages only*/
new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) returnfalse;
for (i = 0, j = 0; i < nr_folios; i++) { struct page *p = compound_head(page_array[j]); struct folio *folio = page_folio(p); unsignedint nr;
WARN_ON_ONCE(i > 0 && p != page_array[j]);
nr = i ? data->nr_pages_mid : data->nr_pages_head;
nr = min(nr, nr_pages_left); /* Drop all but one ref, the entire folio will remain pinned. */ if (nr > 1)
unpin_user_folio(folio, nr - 1);
j += nr;
nr_pages_left -= nr;
new_array[i] = p;
}
/* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail.
*/ for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio &&
page_array[i] == page_array[i-1] + 1) {
count++; continue;
}
if (nr_folios == 1) { if (folio_page_idx(folio, page_array[i-1]) !=
data->nr_pages_mid - 1) returnfalse;
if (ctx->buf_table.nr) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL;
ret = io_rsrc_data_alloc(&data, nr_args); if (ret) return ret;
if (!arg)
memset(iov, 0, sizeof(*iov));
for (i = 0; i < nr_args; i++) { struct io_rsrc_node *node;
u64 tag = 0;
if (arg) {
uvec = (struct iovec __user *) arg;
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) {
ret = PTR_ERR(iov); break;
}
ret = io_buffer_validate(iov); if (ret) break; if (ctx->compat)
arg += sizeof(struct compat_iovec); else
arg += sizeof(struct iovec);
}
if (tags) { if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
ret = -EFAULT; break;
}
}
node = io_sqe_buffer_register(ctx, iov, &last_hpage); if (IS_ERR(node)) {
ret = PTR_ERR(node); break;
} if (tag) { if (!node) {
ret = -EINVAL; break;
}
node->tag = tag;
}
data.nodes[i] = node;
}
io_ring_submit_lock(ctx, issue_flags); if (index >= data->nr) {
ret = -EINVAL; goto unlock;
}
index = array_index_nospec(index, data->nr);
if (data->nodes[index]) {
ret = -EBUSY; goto unlock;
}
node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) {
ret = -ENOMEM; goto unlock;
}
/* * blk_rq_nr_phys_segments() may overestimate the number of bvecs * but avoids needing to iterate over the bvecs
*/
imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); if (!imu) {
kfree(node);
ret = -ENOMEM; goto unlock;
}
ret = validate_fixed_range(buf_addr, len, imu); if (unlikely(ret)) return ret; if (!(imu->dir & (1 << ddir))) return -EFAULT;
offset = buf_addr - imu->ubuf;
if (imu->is_kbuf) return io_import_kbuf(ddir, iter, imu, len, offset);
/* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates * over each segment manually. We can cheat a bit here for user * registered nodes, because we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the * first and last bvec
*/
folio_mask = (1UL << imu->folio_shift) - 1;
bvec = imu->bvec; if (offset >= bvec->bv_len) { unsignedlong seg_skip;
/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	/*
	 * Always take the locks in ascending address order so two tasks
	 * locking the same pair cannot deadlock (ABBA avoidance).
	 */
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	/* nested annotation keeps lockdep happy about same-class locks */
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}
/* Both rings are locked by the caller. */ staticint io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, struct io_uring_clone_buffers *arg)
{ struct io_rsrc_data data; int i, ret, off, nr; unsignedint nbufs;
/* * Accounting state is shared between the two rings; that only works if * both rings are accounted towards the same counters.
*/ if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) return -EINVAL;
/* if offsets are given, must have nr specified too */ if (!arg->nr && (arg->dst_off || arg->src_off)) return -EINVAL; /* not allowed unless REPLACE is set */ if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) return -EBUSY;
ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); if (ret) return ret;
/* Fill entries in data from dst that won't overlap with src */ for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
if (src_node) {
data.nodes[i] = src_node;
src_node->refs++;
}
}
ret = -ENXIO;
nbufs = src_ctx->buf_table.nr; if (!nbufs) goto out_free;
ret = -EINVAL; if (!arg->nr)
arg->nr = nbufs; elseif (arg->nr > nbufs) goto out_free;
ret = -EOVERFLOW; if (check_add_overflow(arg->nr, arg->src_off, &off)) goto out_free; if (off > nbufs) goto out_free;
off = arg->dst_off;
i = arg->src_off;
nr = arg->nr; while (nr--) { struct io_rsrc_node *dst_node, *src_node;
src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); if (!src_node) {
dst_node = NULL;
} else {
dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!dst_node) {
ret = -ENOMEM; goto out_free;
}
/* * If asked for replace, put the old table. data->nodes[] holds both * old and new nodes at this point.
*/ if (arg->flags & IORING_REGISTER_DST_REPLACE)
io_rsrc_data_free(ctx, &ctx->buf_table);
/* * ctx->buf_table must be empty now - either the contents are being * replaced and we just freed the table, or the contents are being * copied to a ring that does not have buffers yet (checked at function * entry).
*/
WARN_ON_ONCE(ctx->buf_table.nr);
ctx->buf_table = data; return 0;
/* * Copy the registered buffers from the source ring whose file descriptor * is given in the src_fd to the current ring. This is identical to registering * the buffers with ctx, except faster as mappings already exist. * * Since the memory is already accounted once, don't account it again.
*/ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{ struct io_uring_clone_buffers buf; struct io_ring_ctx *src_ctx; bool registered_src; struct file *file; int ret;
if (copy_from_user(&buf, arg, sizeof(buf))) return -EFAULT; if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) return -EINVAL; if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) return -EBUSY; if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) return -EINVAL;
ret = validate_fixed_range(buf_addr, iov_len, imu); if (unlikely(ret)) return ret;
if (unlikely(!iov_len)) return -EFAULT; if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) return -EOVERFLOW;
offset = buf_addr - imu->ubuf; /* * Only the first bvec can have non zero bv_offset, account it * here and work with full folios below.
*/
offset += imu->bvec[0].bv_offset;
staticint iov_kern_bvec_size(conststruct iovec *iov, conststruct io_mapped_ubuf *imu, unsignedint *nr_seg)
{
size_t offset = (size_t)(uintptr_t)iov->iov_base; conststruct bio_vec *bvec = imu->bvec; int start = 0, i = 0;
size_t off = 0; int ret;
ret = validate_fixed_range(offset, iov->iov_len, imu); if (unlikely(ret)) return ret;
for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
off += bvec[i].bv_len, i++) { if (offset >= off && offset < off + bvec[i].bv_len)
start = i;
}
*nr_seg = i - start; return 0;
}
if (uvec_segs > iv->nr) {
ret = io_vec_realloc(iv, uvec_segs); if (ret) return ret;
req->flags |= REQ_F_NEED_CLEANUP;
}
/* pad iovec to the right */
iovec_off = iv->nr - uvec_segs;
iov = iv->iovec + iovec_off;
res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
io_is_compat(req->ctx)); if (IS_ERR(res)) return PTR_ERR(res);
req->flags |= REQ_F_IMPORT_BUFFER; return 0;
}
¤ Dauer der Verarbeitung: 0.19 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.