/* * An async IO implementation for Linux * Written by Benjamin LaHaise <bcrl@kvack.org> * * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms.
*/ #define pr_fmt(fmt) "%s: " fmt, __func__
#define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock
* mutex by aio_read_events_ring(). */ unsigned tail;
/* * Plugging is meant to work with larger batches of IOs. If we don't * have more than the below, then don't bother setting up a plug.
*/ #define AIO_PLUG_THRESHOLD 2
/* * For percpu reqs_available, number of slots we move to/from global * counter at a time:
*/ unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. * * The real limit is nr_events - 1, which will be larger (see * aio_setup_ring())
*/ unsigned max_reqs;
/* Size of ringbuffer, in units of struct io_event */ unsigned nr_events;
unsignedlong mmap_base; unsignedlong mmap_size;
struct folio **ring_folios; long nr_pages;
struct rcu_work free_rwork; /* see free_ioctx() */
/* * signals when all in-flight requests are done
*/ struct ctx_rq_wait *rq_wait;
struct { /* * This counts the number of available slots in the ringbuffer, * so we avoid overflowing it: it's decremented (if positive) * when allocating a kiocb and incremented when the resulting * io_event is pulled off the ringbuffer. * * We batch accesses to it with a percpu version.
*/
atomic_t reqs_available;
} ____cacheline_aligned_in_smp;
struct {
spinlock_t ctx_lock; struct list_head active_reqs; /* used for cancellation */
} ____cacheline_aligned_in_smp;
/* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h>
*/ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; struct cred *creds;
};
/* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, * or directly as just 'ki_filp' in this struct.
*/ struct aio_kiocb { union { struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll;
};
struct list_head ki_list; /* the aio core uses this
* for cancellation */
refcount_t ki_refcnt;
/* * If the aio_resfd field of the userspace iocb is not zero, * this is the underlying eventfd context to deliver events to.
*/ struct eventfd_ctx *ki_eventfd;
};
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
/* current system wide number of aio requests */
static unsigned long aio_nr;
/* system wide maximum number of aio requests */
static unsigned long aio_max_nr = 0x10000;
/*----end sysctl variables---*/

#ifdef CONFIG_SYSCTL
static const struct ctl_table aio_sysctls[] = {
	{
		.procname	= "aio-nr",
		.data		= &aio_nr,
		.maxlen		= sizeof(aio_nr),
		/* read-only: aio_nr is a kernel-maintained counter */
		.mode		= 0444,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "aio-max-nr",
		.data		= &aio_max_nr,
		.maxlen		= sizeof(aio_max_nr),
		/* writable: root may tune the system-wide request cap */
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
};

/* Register the "fs" sysctl entries above during boot. */
static void __init aio_sysctl_init(void)
{
	register_sysctl_init("fs", aio_sysctls);
}
#else
#define aio_sysctl_init() do { } while (0)
#endif
/* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence.
*/ staticint __init aio_setup(void)
{ staticstruct file_system_type aio_fs = {
.name = "aio",
.init_fs_context = aio_init_fs_context,
.kill_sb = kill_anon_super,
};
aio_mnt = kern_mount(&aio_fs); if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
/* mapping->i_private_lock here protects against the kioctx teardown. */
spin_lock(&mapping->i_private_lock);
ctx = mapping->i_private_data; if (!ctx) {
rc = -EINVAL; goto out;
}
/* The ring_lock mutex. The prevents aio_read_events() from writing * to the ring's head, and prevents page migration from mucking in * a partially initialized kiotx.
*/ if (!mutex_trylock(&ctx->ring_lock)) {
rc = -EAGAIN; goto out;
}
idx = src->index; if (idx < (pgoff_t)ctx->nr_pages) { /* Make sure the old folio hasn't already been changed */ if (ctx->ring_folios[idx] != src)
rc = -EAGAIN;
} else
rc = -EINVAL;
if (rc != 0) goto out_unlock;
/* Writeback must be complete */
BUG_ON(folio_test_writeback(src));
folio_get(dst);
/* Take completion_lock to prevent other writes to the ring buffer * while the old folio is copied to the new. This prevents new * events from being lost.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
folio_copy(dst, src);
folio_migrate_flags(dst, src);
BUG_ON(ctx->ring_folios[idx] != src);
ctx->ring_folios[idx] = dst;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
/* The old folio is no longer accessible. */
folio_put(src);
/* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call * aio_free_ring(). Use rcu_work.
*/ staticvoid free_ioctx(struct work_struct *work)
{ struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
free_rwork);
pr_debug("freeing %p\n", ctx);
/* At this point we know that there are no any in-flight requests */ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
complete(&ctx->rq_wait->comp);
/* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be.
*/ staticvoid free_ioctx_users(struct percpu_ref *ref)
{ struct kioctx *ctx = container_of(ref, struct kioctx, users); struct aio_kiocb *req;
while (1) { if (table) for (i = 0; i < table->nr; i++) if (!rcu_access_pointer(table->table[i])) {
ctx->id = i;
rcu_assign_pointer(table->table[i], ctx);
spin_unlock(&mm->ioctx_lock);
/* While kioctx setup is in progress, * we are protected from page migration * changes ring_folios by ->ring_lock.
*/
ring = folio_address(ctx->ring_folios[0]);
ring->id = ctx->id; return 0;
}
/*
 * aio_nr_sub
 *	Subtract @nr from the system-wide count of outstanding aio requests,
 *	under aio_nr_lock. Clamps to 0 (and warns) if the subtraction would
 *	underflow, which would indicate an accounting bug elsewhere.
 */
static void aio_nr_sub(unsigned nr)
{
	spin_lock(&aio_nr_lock);
	/* "aio_nr - nr > aio_nr" is the unsigned-underflow test */
	if (WARN_ON(aio_nr - nr > aio_nr))
		aio_nr = 0;
	else
		aio_nr -= nr;
	spin_unlock(&aio_nr_lock);
}
/* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
*/ staticstruct kioctx *ioctx_alloc(unsigned nr_events)
{ struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM;
/* * Store the original nr_events -- what userspace passed to io_setup(), * for counting against the global limit -- before it changes.
*/ unsignedint max_reqs = nr_events;
/* * We keep track of the number of available ringbuffer slots, to prevent * overflow (reqs_available), and we also use percpu counters for this. * * So since up to half the slots might be on other cpu's percpu counters * and unavailable, double nr_events so userspace sees what they * expected: additionally, we move req_batch slots to/from percpu * counters at a time, so make sure that isn't 0:
*/
nr_events = max(nr_events, num_possible_cpus() * 4);
nr_events *= 2;
/* Prevent overflows */ if (nr_events > (0x10000000U / sizeof(struct io_event))) {
pr_debug("ENOMEM: nr_events too high\n"); return ERR_PTR(-EINVAL);
}
if (!nr_events || (unsignedlong)max_reqs > aio_max_nr) return ERR_PTR(-EAGAIN);
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM);
ctx->max_reqs = max_reqs;
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->completion_lock);
mutex_init(&ctx->ring_lock); /* Protect against page migration throughout kiotx setup by keeping
* the ring_lock mutex held until setup is complete. */
mutex_lock(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err;
if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err;
ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err;
err = aio_setup_ring(ctx, nr_events); if (err < 0) goto err;
/* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx.
*/ staticint kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, struct ctx_rq_wait *wait)
{ struct kioctx_table *table;
spin_lock(&mm->ioctx_lock); if (atomic_xchg(&ctx->dead, 1)) {
spin_unlock(&mm->ioctx_lock); return -EINVAL;
}
/* free_ioctx_reqs() will do the necessary RCU synchronization */
wake_up_all(&ctx->wait);
/* * It'd be more correct to do this in free_ioctx(), after all * the outstanding kiocbs have finished - but by then io_destroy * has already returned, so io_setup() could potentially return * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell).
*/
aio_nr_sub(ctx->max_reqs);
if (ctx->mmap_size)
vm_munmap(ctx->mmap_base, ctx->mmap_size);
/* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submited or any of the io_* syscalls to be * called on the context. * * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on * them.
*/ void exit_aio(struct mm_struct *mm)
{ struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); struct ctx_rq_wait wait; int i, skipped;
skipped = 0; for (i = 0; i < table->nr; ++i) { struct kioctx *ctx =
rcu_dereference_protected(table->table[i], true);
if (!ctx) {
skipped++; continue;
}
/* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, * this is not necessarily our ->mm. * Since kill_ioctx() uses non-zero ->mmap_size as indicator * that it needs to unmap the area, just set it to 0.
*/
ctx->mmap_size = 0;
kill_ioctx(mm, ctx, &wait);
}
if (!atomic_sub_and_test(skipped, &wait.count)) { /* Wait until all IO for the context are done. */
wait_for_completion(&wait.comp);
}
local_irq_save(flags);
kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int avail = atomic_read(&ctx->reqs_available);
do { if (avail < ctx->req_batch) goto out;
} while (!atomic_try_cmpxchg(&ctx->reqs_available,
&avail, avail - ctx->req_batch));
kcpu->reqs_available += ctx->req_batch;
}
ret = true;
kcpu->reqs_available--;
out:
local_irq_restore(flags); return ret;
}
/* refill_reqs_available * Updates the reqs_available reference counts used for tracking the * number of free slots in the completion ring. This can be called * from aio_complete() (to optimistically update reqs_available) or * from aio_get_req() (the we're out of events case). It must be * called holding ctx->completion_lock.
*/ staticvoid refill_reqs_available(struct kioctx *ctx, unsigned head, unsigned tail)
{ unsigned events_in_ring, completed;
/* Clamp head since userland can write to it. */
head %= ctx->nr_events; if (head <= tail)
events_in_ring = tail - head; else
events_in_ring = ctx->nr_events - (head - tail);
/* user_refill_reqs_available * Called to refill reqs_available when aio_get_req() encounters an * out of space in the completion ring.
*/ staticvoid user_refill_reqs_available(struct kioctx *ctx)
{
spin_lock_irq(&ctx->completion_lock); if (ctx->completed_events) { struct aio_ring *ring; unsigned head;
/* Access of ring->head may race with aio_read_events_ring() * here, but that's okay since whether we read the old version * or the new version, and either will be valid. The important * part is that head cannot pass tail since we prevent * aio_complete() from updating tail by holding * ctx->completion_lock. Even if head is invalid, the check * against ctx->completed_events below will make sure we do the * safe/right thing.
*/
ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
/* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. * * The refcount is initialized to 2 - one for the async op completion, * one for the synchronous code that does this.
*/ staticinlinestruct aio_kiocb *aio_get_req(struct kioctx *ctx)
{ struct aio_kiocb *req;
req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) return NULL;
if (unlikely(!get_reqs_available(ctx))) {
kmem_cache_free(kiocb_cachep, req); return NULL;
}
/* aio_complete * Called when the io request on the given iocb is complete.
*/ staticvoid aio_complete(struct aio_kiocb *iocb)
{ struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head, avail; unsignedlong flags;
/* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
/* after flagging the request as done, we * must never even look at it again
*/
smp_wmb(); /* make event visible before updating tail */
ctx->tail = tail;
ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
ring->tail = tail;
flush_dcache_folio(ctx->ring_folios[0]);
ctx->completed_events++; if (ctx->completed_events > 1)
refill_reqs_available(ctx, head, tail);
avail = tail > head
? tail - head
: tail + ctx->nr_events - head;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
pr_debug("added to ring %p at [%u]\n", iocb, tail);
/* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context.
*/ if (iocb->ki_eventfd)
eventfd_signal(iocb->ki_eventfd);
/* * We have to order our ring_info tail store above and test * of the wait list below outside the wait lock. This is * like in wake_up_bit() where clearing a bit has to be * ordered with the unlocked test.
*/
smp_mb();
if (waitqueue_active(&ctx->wait)) { struct aio_waiter *curr, *next; unsignedlong flags;
/* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched
*/ staticlong aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr)
{ struct aio_ring *ring; unsigned head, tail, pos; long ret = 0; int copy_ret;
/* * The mutex can block and wake us up and that will cause * wait_event_interruptible_hrtimeout() to schedule without sleeping * and repeat. This should be rare enough that it doesn't cause * peformance issues. See the comment in read_events() for more detail.
*/
sched_annotate_sleep();
mutex_lock(&ctx->ring_lock);
/* Access to ->ring_folios here is protected by ctx->ring_lock. */
ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
tail = ring->tail;
/* * Ensure that once we've read the current tail pointer, that * we also see the events that were stored up to the tail.
*/
smp_rmb();
staticbool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i)
{ long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
if (ret > 0)
*i += ret;
if (unlikely(atomic_read(&ctx->dead)))
ret = -EINVAL;
if (!*i)
*i = ret;
return ret < 0 || *i >= min_nr;
}
staticlong read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event,
ktime_t until)
{ struct hrtimer_sleeper t; struct aio_waiter w; long ret = 0, ret2 = 0;
/* * Note that aio_read_events() is being called as the conditional - i.e. * we're calling it after prepare_to_wait() has set task state to * TASK_INTERRUPTIBLE. * * But aio_read_events() can block, and if it blocks it's going to flip * the task state back to TASK_RUNNING. * * This should be ok, provided it doesn't flip the state back to * TASK_RUNNING and return 0 too much - that causes us to spin. That * will only happen if the mutex_lock() call blocks, and we then find * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code.
*/
aio_read_events(ctx, min_nr, nr, event, &ret); if (until == 0 || ret < 0 || ret >= min_nr) return ret;
/* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and * must be initialized to 0 prior to the call. On successful * creation of the aio_context, *ctxp is filled in with the resulting * handle. May fail with -EINVAL if *ctxp is not initialized, * if the specified nr_events exceeds internal limits. May fail * with -EAGAIN if the specified nr_events exceeds the user's limit * of available events. May fail with -ENOMEM if insufficient kernel * resources are available. May fail with -EFAULT if an invalid * pointer is passed for ctxp. Will fail with -ENOSYS if not * implemented.
*/
SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
{ struct kioctx *ioctx = NULL; unsignedlong ctx; long ret;
ret = get_user(ctx, ctxp); if (unlikely(ret)) goto out;
ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) {
pr_debug("EINVAL: ctx %lu nr_events %u\n",
ctx, nr_events); goto out;
}
ioctx = ioctx_alloc(nr_events);
ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp); if (ret)
kill_ioctx(current->mm, ioctx, NULL);
percpu_ref_put(&ioctx->users);
}
ret = get_user(ctx, ctx32p); if (unlikely(ret)) goto out;
ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) {
pr_debug("EINVAL: ctx %lu nr_events %u\n",
ctx, nr_events); goto out;
}
ioctx = ioctx_alloc(nr_events);
ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { /* truncating is ok because it's a user address */
ret = put_user((u32)ioctx->user_id, ctx32p); if (ret)
kill_ioctx(current->mm, ioctx, NULL);
percpu_ref_put(&ioctx->users);
}
out: return ret;
} #endif
/* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not * implemented. May fail with -EINVAL if the context pointed to * is invalid.
*/
SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{ struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { struct ctx_rq_wait wait; int ret;
/* Pass requests_done to kill_ioctx() where it can be set * in a thread-safe way. If we try to set it here then we have * a race condition if two io_destroy() called simultaneously.
*/
ret = kill_ioctx(current->mm, ioctx, &wait);
percpu_ref_put(&ioctx->users);
/* Wait until all IO for the context are done. Otherwise kernel * keep using user-space buffers even if user thinks the context * is destroyed.
*/ if (!ret)
wait_for_completion(&wait.comp);
staticint aio_prep_rw(struct kiocb *req, conststruct iocb *iocb, int rw_type)
{ int ret;
req->ki_write_stream = 0;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then * aio_reqprio is interpreted as an I/O scheduling * class and priority.
*/
ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) {
pr_debug("aio ioprio check cap error: %d\n", ret); return ret;
}
/*
 * aio_rw_done
 *	Finish a read/write request given the submission-path return value.
 *	-EIOCBQUEUED means the op will complete asynchronously; everything
 *	else is completed here, with restart errnos mapped to -EINTR.
 */
static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		/* Completion will be delivered by the async path later. */
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * There's no easy way to restart the syscall since other AIO's
		 * may be already running. Just fail this IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		req->ki_complete(req, ret);
	}
}
/* * Safely lock the waitqueue which the request is on, synchronizing with the * case where the ->poll() provider decides to free its waitqueue early. * * Returns true on success, meaning that req->head->lock was locked, req->wait * is on req->head, and an RCU read lock was taken. Returns false if the * request was already removed from its waitqueue (which might no longer exist).
*/ staticbool poll_iocb_lock_wq(struct poll_iocb *req)
{
wait_queue_head_t *head;
/* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us, then check whether the request is still on the queue. * * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. * In that case, only RCU prevents the queue memory from being freed.
*/
rcu_read_lock();
head = smp_load_acquire(&req->head); if (head) {
spin_lock(&head->lock); if (!list_empty(&req->wait.entry)) returntrue;
spin_unlock(&head->lock);
}
rcu_read_unlock(); returnfalse;
}
if (!READ_ONCE(req->cancelled))
mask = vfs_poll(req->file, &pt) & req->events;
/* * Note that ->ki_cancel callers also delete iocb from active_reqs after * calling ->ki_cancel. We need the ctx_lock roundtrip here to * synchronize with them. In the cancellation case the list_del_init * itself is not actually needed, but harmless so we keep it in to * avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->ctx_lock); if (poll_iocb_lock_wq(req)) { if (!mask && !READ_ONCE(req->cancelled)) { /* * The request isn't actually ready to be completed yet. * Reschedule completion if another wakeup came in.
*/ if (req->work_need_resched) {
schedule_work(&req->work);
req->work_need_resched = false;
} else {
req->work_scheduled = false;
}
poll_iocb_unlock_wq(req);
spin_unlock_irq(&ctx->ctx_lock); return;
}
list_del_init(&req->wait.entry);
poll_iocb_unlock_wq(req);
} /* else, POLLFREE has freed the waitqueue, so we must complete */
list_del_init(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
spin_unlock_irq(&ctx->ctx_lock);
iocb_put(iocb);
}
/* assumes we are called with irqs disabled */ staticint aio_poll_cancel(struct kiocb *iocb)
{ struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll;
if (poll_iocb_lock_wq(req)) {
WRITE_ONCE(req->cancelled, true); if (!req->work_scheduled) {
schedule_work(&aiocb->poll.work);
req->work_scheduled = true;
}
poll_iocb_unlock_wq(req);
} /* else, the request was force-cancelled by POLLFREE already */
/* for instances that support it check for an event match first: */ if (mask && !(mask & req->events)) return 0;
/* * Complete the request inline if possible. This requires that three * conditions be met: * 1. An event mask must have been passed. If a plain wakeup was done * instead, then mask == 0 and we have to call vfs_poll() to get * the events, so inline completion isn't possible. * 2. The completion work must not have already been scheduled. * 3. ctx_lock must not be busy. We have to use trylock because we * already hold the waitqueue lock, so this inverts the normal * locking order. Use irqsave/irqrestore because not all * filesystems (e.g. fuse) call this function with IRQs disabled, * yet IRQs have to be disabled before ctx_lock is obtained.
*/ if (mask && !req->work_scheduled &&
spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx;
list_del_init(&req->wait.entry);
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask); if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
}
spin_unlock_irqrestore(&ctx->ctx_lock, flags); if (iocb)
iocb_put(iocb);
} else { /* * Schedule the completion work if needed. If it was already * scheduled, record that another wakeup came in. * * Don't remove the request from the waitqueue here, as it might * not actually be complete yet (we won't know until vfs_poll() * is called), and we must not miss any wakeups. POLLFREE is an * exception to this; see below.
*/ if (req->work_scheduled) {
req->work_need_resched = true;
} else {
schedule_work(&req->work);
req->work_scheduled = true;
}
/* * If the waitqueue is being freed early but we can't complete * the request inline, we have to tear down the request as best * we can. That means immediately removing the request from its * waitqueue and preventing all further accesses to the * waitqueue via the request. We also need to schedule the * completion work (done above). Also mark the request as * cancelled, to potentially skip an unneeded call to ->poll().
*/ if (mask & POLLFREE) {
WRITE_ONCE(req->cancelled, true);
list_del_init(&req->wait.entry);
/* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock.
*/
smp_store_release(&req->head, NULL);
}
} return 1;
}
/* reject any unknown events outside the normal event mask. */ if ((u16)iocb->aio_buf != iocb->aio_buf) return -EINVAL; /* reject fields that are not defined for poll */ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) return -EINVAL;
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
apt.queued = false;
apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
INIT_LIST_HEAD(&req->wait.entry);
init_waitqueue_func_entry(&req->wait, aio_poll_wake);
if (!on_queue || req->work_scheduled) { /* * aio_poll_wake() already either scheduled the async * completion work, or completed the request inline.
*/ if (apt.error) /* unsupported case: multiple queues */
cancel = true;
apt.error = 0;
mask = 0;
} if (mask || apt.error) { /* Steal to complete synchronously. */
list_del_init(&req->wait.entry);
} elseif (cancel) { /* Cancel if possible (may be too late though). */
WRITE_ONCE(req->cancelled, true);
} elseif (on_queue) { /* * Actually waiting for an event, so add the request to * active_reqs so that it can be cancelled if needed.
*/
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
aiocb->ki_cancel = aio_poll_cancel;
} if (on_queue)
poll_iocb_unlock_wq(req);
} if (mask) { /* no async, we'd stolen it */
aiocb->ki_res.res = mangle_poll(mask);
apt.error = 0;
}
spin_unlock_irq(&ctx->ctx_lock); if (mask)
iocb_put(aiocb); return apt.error;
}
if (iocb->aio_flags & IOCB_FLAG_RESFD) { struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function.
*/
eventfd = eventfd_ctx_fdget(iocb->aio_resfd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd);
req->ki_eventfd = eventfd;
}
if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
pr_debug("EFAULT: aio_key\n"); return -EFAULT;
}
/* Done with the synchronous reference */
iocb_put(req);
/* * If err is 0, we'd either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves.
*/ if (unlikely(err)) {
iocb_destroy(req);
put_reqs_available(ctx, 1);
} return err;
}
/* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued. May return -EINVAL if the aio_context * specified by ctx_id is invalid, if nr is < 0, if the iocb at * *iocbpp[0] is not properly initialized, if the operation specified * is invalid for the file descriptor in the iocb. May fail with * -EFAULT if any of the data structures point to invalid data. May * fail with -EBADF if the file descriptor specified in the first * iocb is invalid. May fail with -EAGAIN if insufficient resources * are available to queue any iocbs. Will return 0 if nr is 0. Will * fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, struct iocb __user * __user *, iocbpp)
{ struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug;
if (nr > AIO_PLUG_THRESHOLD)
blk_start_plug(&plug); for (i = 0; i < nr; i++) {
compat_uptr_t user_iocb;
if (unlikely(get_user(user_iocb, iocbpp + i))) {
ret = -EFAULT; break;
}
ret = io_submit_one(ctx, compat_ptr(user_iocb), true); if (ret) break;
} if (nr > AIO_PLUG_THRESHOLD)
blk_finish_plug(&plug);
percpu_ref_put(&ctx->users); return i ? i : ret;
} #endif
/* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is * copied into the memory pointed to by result without being placed * into the completion queue and 0 is returned. May fail with * -EFAULT if any of the data structures pointed to are invalid. * May fail with -EINVAL if aio_context specified by ctx_id is * invalid. May fail with -EAGAIN if the iocb specified was not * cancelled. Will fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
		struct io_event __user *, result)
{
	struct kioctx *ctx;
	struct aio_kiocb *kiocb;
	int ret = -EINVAL;
	u32 key;
	/* Requests are matched by the userspace iocb pointer value. */
	u64 obj = (u64)(unsigned long)iocb;

	if (unlikely(get_user(key, &iocb->aio_key)))
		return -EFAULT;
	if (unlikely(key != KIOCB_KEY))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx))
		return -EINVAL;

	/* ctx_lock protects active_reqs against concurrent completion. */
	spin_lock_irq(&ctx->ctx_lock);
	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
		if (kiocb->ki_res.obj == obj) {
			ret = kiocb->ki_cancel(&kiocb->rw);
			list_del_init(&kiocb->ki_list);
			break;
		}
	}
	spin_unlock_irq(&ctx->ctx_lock);

	if (!ret) {
		/*
		 * The result argument is no longer used - the io_event is
		 * always delivered via the ring buffer. -EINPROGRESS indicates
		 * cancellation is progress:
		 */
		ret = -EINPROGRESS;
	}

	percpu_ref_put(&ctx->users);

	return ret;
}
/*
 * do_io_getevents
 *	Common worker for the io_getevents syscall variants. Converts the
 *	optional relative timeout to ktime (NULL means wait forever), looks
 *	up the context, validates 0 <= min_nr <= nr, and reads events.
 */
static long do_io_getevents(aio_context_t ctx_id,
		long min_nr,
		long nr,
		struct io_event __user *events,
		struct timespec64 *ts)
{
	ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
	struct kioctx *ioctx = lookup_ioctx(ctx_id);
	long ret = -EINVAL;

	if (likely(ioctx)) {
		if (likely(min_nr <= nr && min_nr >= 0))
			ret = read_events(ioctx, min_nr, nr, events, until);
		/* Drop the reference taken by lookup_ioctx(). */
		percpu_ref_put(&ioctx->users);
	}

	return ret;
}
/* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If * it succeeds, the number of read events is returned. May fail with * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is * out of range, if timeout is out of range. May fail with -EFAULT * if any of the memory specified is invalid. May return 0 or * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented.
*/ #ifdef CONFIG_64BIT
/*
 * NOTE(review): stray non-source text from the extraction tool, preserved
 * here as a comment so the file stays well-formed C. Original German
 * boilerplate, translated: "The information on this web page was compiled
 * carefully to the best of our knowledge. However, neither completeness,
 * correctness, nor quality of the information provided is guaranteed.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */