/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epnested_mutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (spinlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, which might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * The epnested_mutex is acquired when inserting an epoll fd onto another
 * epoll fd. We do this so that we can walk the epoll tree and ensure that
 * the insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * about to do so.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It would be possible to drop the "ep->mtx" and to use the global
 * mutex "epnested_mutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" makes the interface more scalable.
 * Events that require holding "epnested_mutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * better scalability.
 */
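/*
 * A minimal userspace sketch of the nesting case described above (purely
 * illustrative, not part of this file; the fd names are hypothetical):
 *
 *	e1 = epoll_create1(0);
 *	e2 = epoll_create1(0);
 *	ev.events = EPOLLIN;
 *	ev.data.fd = e2;
 *	epoll_ctl(e1, EPOLL_CTL_ADD, e2, &ev);
 *
 * After this EPOLL_CTL_ADD, any path that needs both mutexes (for example
 * ep_item_poll() recursing from e1 into e2) takes e1->mtx before e2->mtx,
 * using the recursion depth as the lockdep subclass.
 */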
struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;
/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct eppoll_entry *next;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/*
	 * Protected by file->f_lock, true for a to-be-released epitem already
	 * removed from the "struct file" items list; together with
	 * eventpoll->refcount orchestrates "struct eventpoll" disposal
	 */
	bool dying;

	/* List containing poll wait queues */
	struct eppoll_entry *pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct hlist_node fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	spinlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize loop detection check */
	u64 gen;
	struct hlist_head refs;

	u8 loop_check_depth;

	/*
	 * usage count, used together with epitem->dying to
	 * orchestrate the disposal of this struct
	 */
	refcount_t refcount;
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
	/* busy poll timeout */
	u32 busy_poll_usecs;
	/* busy poll packet budget */
	u16 busy_poll_budget;
	bool prefer_busy_poll;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};
/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */

/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/* Used for cycles detection */
static DEFINE_MUTEX(epnested_mutex);

static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __ro_after_init;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __ro_after_init;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epnested_mutex.
 */
struct epitems_head {
	struct hlist_head epitems;
	struct epitems_head *next;
};

static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}
/* Get the "struct epitem" from a wait queue pointer */ staticinlinestruct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{ return container_of(p, struct eppoll_entry, wait)->base;
}
/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Return: a value different than %zero if ready events are available,
 * or %zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty_careful(&ep->rdllist) ||
		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
#ifdef CONFIG_NET_RX_BUSY_POLL

/**
 * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
 * from the epoll instance ep is preferred, but if it is not set fallback to
 * the system-wide global via busy_loop_timeout.
 *
 * @start_time: The start time used to compute the remaining time until timeout.
 * @ep: Pointer to the eventpoll context.
 *
 * Return: true if the timeout has expired, false otherwise.
 */
static bool busy_loop_ep_timeout(unsigned long start_time,
				 struct eventpoll *ep)
{
	unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);

	if (bp_usec) {
		unsigned long end_time = start_time + bp_usec;
		unsigned long now = busy_loop_current_time();

		return time_after(now, end_time);
	} else {
		return busy_loop_timeout(start_time);
	}
}
/*
 * Busy poll if globally on and supporting sockets found && no events,
 * busy loop will return if need_resched or ep_events_available.
 *
 * we must do our busy polling with irqs enabled
 */
static bool ep_busy_loop(struct eventpoll *ep)
{
	unsigned int napi_id = READ_ONCE(ep->napi_id);
	u16 budget = READ_ONCE(ep->busy_poll_budget);
	bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);

	if (!budget)
		budget = BUSY_POLL_BUDGET;

	if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {
		napi_busy_loop(napi_id, ep_busy_loop_end,
			       ep, prefer_busy_poll, budget);
		if (ep_events_available(ep))
			return true;
		/*
		 * Busy poll timed out. Drop NAPI ID for now, we can add
		 * it back in when we have moved a socket with a valid NAPI
		 * ID onto the ready list.
		 */
		if (prefer_busy_poll)
			napi_resume_irqs(napi_id);
		ep->napi_id = 0;
		return false;
	}
	return false;
}
/*
 * Set epoll busy poll NAPI ID from sk.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
	struct eventpoll *ep = epi->ep;
	unsigned int napi_id;
	struct socket *sock;
	struct sock *sk;

	if (!ep_busy_loop_on(ep))
		return;

	sock = sock_from_file(epi->ffd.file);
	if (!sock)
		return;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);

	/* Non-NAPI IDs can be rejected
	 * or
	 * Nothing to do if we already have this ID
	 */
	if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
		return;

	/* record NAPI ID for use in next busy poll */
	ep->napi_id = napi_id;
}
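/*
 * Usage note (userspace, illustrative): busy polling only kicks in once a
 * NAPI ID has been recorded by the callback above and ep_busy_loop_on() is
 * true. A hedged sketch of enabling it per epoll instance, assuming a kernel
 * that exposes struct epoll_params and the EPIOCSPARAMS ioctl:
 *
 *	struct epoll_params params = {
 *		.busy_poll_usecs  = 64,
 *		.busy_poll_budget = 8,
 *		.prefer_busy_poll = 1,
 *	};
 *	ioctl(epfd, EPIOCSPARAMS, &params);
 *
 * The system-wide fallback remains the net.core.busy_poll sysctl checked by
 * busy_loop_timeout().
 */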
/*
 * As described in commit 0ccf831cb ("lockdep: annotate epoll"), the use of
 * wait queues used by epoll is done in a very controlled manner. Wake ups
 * can nest inside each other, but are never done with the same locking.
 * For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked about the
 * recursion constraints. That is, no more than EP_MAX_NESTS, to avoid
 * stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
			     __poll_t pollflags)
{
	struct eventpoll *ep_src;
	unsigned long flags;
	u8 nests = 0;

	/*
	 * To set the subclass or nesting level for spin_lock_irqsave_nested()
	 * it might be natural to create a per-cpu nest count. However, since
	 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
	 * schedule() in the -rt kernel, the per-cpu variable is no longer
	 * protected. Thus, we are introducing a per-eventpoll nest field.
	 * If we are not being called from ep_poll_callback(), epi is NULL and
	 * we are at the first level of nesting, 0. Otherwise, we are being
	 * called from ep_poll_callback() and if a previous wakeup source is
	 * not an epoll file itself, we are at depth 1 since the wakeup source
	 * is depth 0. If the wakeup source is a previous epoll file in the
	 * wakeup chain then we use its nests value and record ours as
	 * nests + 1. The previous epoll file nests value is stable since it
	 * is already holding its own poll_wait.lock.
	 */
	if (epi) {
		if ((is_file_epoll(epi->ffd.file))) {
			ep_src = epi->ffd.file->private_data;
			nests = ep_src->nests;
		} else {
			nests = 1;
		}
	}
	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
	ep->nests = nests + 1;
	wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
	ep->nests = 0;
	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
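/*
 * Worked example (illustrative), following the dfd/efd1/efd2 chain from the
 * comment above: a wakeup on dfd runs ep_poll_callback() for efd1's epitem;
 * since dfd is not an epoll file, efd1's poll_wait is locked with subclass 1
 * and efd1->nests is set to 2. That wakeup in turn runs ep_poll_callback()
 * for efd2's epitem, whose file *is* an epoll file (efd1), so efd2 reuses
 * efd1->nests == 2 as its subclass. Each level of the chain therefore gets a
 * distinct lockdep subclass, and the loop/depth checks done at insert time
 * keep the chain within EP_MAX_NESTS.
 */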
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/*
	 * If it is cleared by POLLFREE, it should be rcu-safe.
	 * If we read NULL we need a barrier paired with
	 * smp_store_release() in ep_poll_callback(), otherwise
	 * we rely on whead->lock.
	 */
	whead = smp_load_acquire(&pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}
/*
 * This function unregisters poll callbacks from the associated file
 * descriptor. Must be called with "mtx" held.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct eppoll_entry **p = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while ((pwq = *p) != NULL) {
		*p = pwq->next;
		ep_remove_wait_queue(pwq);
		kmem_cache_free(pwq_cache, pwq);
	}
}

/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}
/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (ws)
		__pm_stay_awake(ws);
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *ws;

	rcu_read_lock();
	ws = rcu_dereference(epi->ws);
	if (ws)
		__pm_stay_awake(ws);
	rcu_read_unlock();
}
/*
 * ep->mutex needs to be held because we could be hit by
 * eventpoll_release_file() and epoll_ctl().
 */
static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
{
	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	lockdep_assert_irqs_enabled();
	spin_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist)
{
	struct epitem *epi, *nepi;

	spin_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in order
			 * to keep in FIFO.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
	}

	spin_unlock_irq(&ep->lock);
}
/*
 * Returns true if the event poll can be disposed
 */
static bool ep_refcount_dec_and_test(struct eventpoll *ep)
{
	if (!refcount_dec_and_test(&ep->refcount))
		return false;

	return true;
}
/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 * If the dying flag is set, do the removal only if force is true.
 * This prevents ep_clear_and_put() from dropping all the ep references
 * while running concurrently with eventpoll_release_file().
 * Returns true if the eventpoll can be disposed.
 */
static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
{
	struct file *file = epi->ffd.file;
	struct epitems_head *to_free;
	struct hlist_head *head;

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	if (epi->dying && !force) {
		spin_unlock(&file->f_lock);
		return false;
	}

	to_free = NULL;
	head = file->f_ep;
	if (head->first == &epi->fllink && !epi->fllink.next) {
		/* See eventpoll_release() for details. */
		WRITE_ONCE(file->f_ep, NULL);
		if (!is_file_epoll(file)) {
			struct epitems_head *v;

			v = container_of(head, struct epitems_head, epitems);
			if (!smp_load_acquire(&v->next))
				to_free = v;
		}
	}
	hlist_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);
	free_ephead(to_free);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	spin_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	spin_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item. Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
	 * use of the rbn field.
	 */
	kfree_rcu(epi, rcu);

	percpu_counter_dec(&ep->user->epoll_watches);
	return true;
}
/*
 * ep_remove variant for callers owning an additional reference to the ep
 */
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
	if (__ep_remove(ep, epi, false))
		WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
	/*
	 * Walks through the whole tree and tries to free each "struct epitem".
	 * Note that ep_remove_safe() will not remove the epitem in case of a
	 * racing eventpoll_release_file(); the latter will do the removal.
	 * At this point we are sure no poll callbacks will be lingering around.
	 * Since we still own a reference to the eventpoll struct, the loop can't
	 * dispose it.
	 */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
		next = rb_next(rbp);
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove_safe(ep, epi);
		cond_resched();
	}

	mutex_unlock(&ep->mtx);
	if (ep_refcount_dec_and_test(ep))
		ep_free(ep);
}
	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 */
	mutex_lock_nested(&ep->mtx, depth);
	ep_start_scan(ep, &txlist);
	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
		if (ep_item_poll(epi, &pt, depth + 1)) {
			res = EPOLLIN | EPOLLRDNORM;
			break;
		} else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as the
			 * caller-requested events go. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}
	ep_done_scan(ep, &txlist);
	mutex_unlock(&ep->mtx);
	return res;
}
/*
 * The ffd.file pointer may be in the process of being torn down due to
 * being closed, but we may not have finished eventpoll_release() yet.
 *
 * Normally, even with the atomic_long_inc_not_zero, the file may have
 * been free'd and then gotten re-allocated to something else (since
 * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
 *
 * But for epoll, users hold the ep->mtx mutex, and as such any file in
 * the process of being free'd will block in eventpoll_release_file()
 * and thus the underlying file allocation will not be free'd, and the
 * file re-use cannot happen.
 *
 * For the same reason we can avoid a rcu_read_lock() around the
 * operation - 'ffd.file' cannot go away even if the refcount has
 * reached zero (but we must still not call out to ->poll() functions
 * etc).
 */
static struct file *epi_fget(const struct epitem *epi)
{
	struct file *file;
/*
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
			     int depth)
{
	struct file *file = epi_fget(epi);
	__poll_t res;

	/*
	 * We could return EPOLLERR | EPOLLHUP or something, but let's
	 * treat this more as "file doesn't exist, poll didn't happen".
	 */
	if (!file)
		return 0;

	pt->_key = epi->event.events;
	if (!is_file_epoll(file))
		res = vfs_poll(file, pt);
	else
		res = __ep_eventpoll_poll(file, pt, depth);
	fput(file);
	return res & epi->event.events;
}
/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi;
	bool dispose;

	/*
	 * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
	 * touching the epitems list before eventpoll_release_file() can access
	 * the ep->mtx.
	 */
again:
	spin_lock(&file->f_lock);
	if (file->f_ep && file->f_ep->first) {
		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
		epi->dying = true;
		spin_unlock(&file->f_lock);

		/*
		 * ep access is safe as we still own a reference to the ep
		 * struct
		 */
		ep = epi->ep;
		mutex_lock(&ep->mtx);
		dispose = __ep_remove(ep, epi, true);
		mutex_unlock(&ep->mtx);

		if (dispose && ep_refcount_dec_and_test(ep))
			ep_free(ep);
		goto again;
	}
	spin_unlock(&file->f_lock);
}
/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;
/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync,
			    void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	spin_lock_irqsave(&ep->lock, flags);

	ep_set_busy_poll_napi_id(epi);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD is issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (pollflags && !(pollflags & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = READ_ONCE(ep->ovflist);
			WRITE_ONCE(ep->ovflist, epi);
			ep_pm_stay_awake_rcu(epi);
		}
	} else if (!ep_is_linked(epi)) {
		/* In the usual case, add event to ready list. */
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up (if active) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
		    !(pollflags & POLLFREE)) {
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		if (sync)
			wake_up_sync(&ep->wq);
		else
			wake_up(&ep->wq);
	}
	if (waitqueue_active(&ep->poll_wait))
		pwake++;
out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);

	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;

	if (pollflags & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->entry);
		/*
		 * ->whead != NULL protects us from the race with
		 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
		 * takes whead->lock held by the caller. Once we nullify it,
		 * nothing protects ep/epi or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}
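/*
 * Usage sketch (userspace, illustrative): EPOLLEXCLUSIVE exists to tame the
 * thundering herd when several epoll instances watch the same fd. Each
 * worker adds the shared listening socket to its own epoll fd:
 *
 *	ev.events  = EPOLLIN | EPOLLEXCLUSIVE;
 *	ev.data.fd = listen_fd;
 *	epoll_ctl(worker_epfd, EPOLL_CTL_ADD, listen_fd, &ev);
 *
 * The callback above supports this through its return value: a non-zero
 * "ewake" counts this waiter as woken, which lets an exclusive wakeup stop
 * after a single epoll instance, while returning zero (the reported events
 * did not match the registered interest) makes the wakeup move on to the
 * next waiter on the queue.
 */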
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
	struct epitem *epi = epq->epi;
	struct eppoll_entry *pwq;

	if (unlikely(!epi))	/* an earlier allocation has failed */
		return;
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
	struct epitem *epic;
	bool leftmost = true;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
		if (kcmp > 0) {
			p = &parent->rb_right;
			leftmost = false;
		} else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}
#define PATH_ARR_SIZE 5
/*
 * These are the numbers of paths of length 1 to 5, that we are allowing to
 * emanate from a single file of interest. For example, we allow 1000 paths
 * of length 1 to emanate from each file of interest. This essentially
 * represents the potential wakeup paths, which need to be limited in order
 * to avoid massive uncontrolled wakeup storms. The common use case should be
 * a single ep which is connected to n file sources. In this case each file
 * source has 1 path of length 1. Thus, the numbers below should be more than
 * sufficient. These path limits are enforced during an EPOLL_CTL_ADD
 * operation, since a modify and delete can't add additional paths.
 * Protected by the epnested_mutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];
static int path_count_inc(int nests)
{
	/* Allow an arbitrary number of depth 1 paths */
	if (nests == 0)
		return 0;

	if (++path_count[nests] > path_limits[nests])
		return -1;
	return 0;
}

static void path_count_init(void)
{
	int i;

	for (i = 0; i < PATH_ARR_SIZE; i++)
		path_count[i] = 0;
}
static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
	int error = 0;
	struct epitem *epi;

	if (depth > EP_MAX_NESTS) /* too deep nesting */
		return -1;

	/* CTL_DEL can remove links here, but that can't increase our count */
	hlist_for_each_entry_rcu(epi, refs, fllink) {
		struct hlist_head *refs = &epi->ep->refs;

		if (hlist_empty(refs))
			error = path_count_inc(depth);
		else
			error = reverse_path_check_proc(refs, depth + 1);
		if (error != 0)
			break;
	}
	return error;
}
/**
 * reverse_path_check - The tfile_check_list is a list of epitems_head, which
 *                      have links that are proposed to be newly added. We
 *                      need to make sure that those added links don't add too
 *                      many paths such that we will spend all our time waking
 *                      up eventpoll objects.
 *
 * Return: %zero if the proposed links don't create too many paths,
 *	   %-1 otherwise.
 */
static int reverse_path_check(void)
{
	struct epitems_head *p;

	for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
		int error;

		path_count_init();
		rcu_read_lock();
		error = reverse_path_check_proc(&p->epitems, 0);
		rcu_read_unlock();
		if (error)
			return error;
	}
	return 0;
}
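/*
 * Worked example (illustrative): suppose a plain file is already watched by
 * 600 distinct epoll fds, and each of those epoll fds is itself nested inside
 * one more epoll fd. The reverse walk from the file then counts 600 paths at
 * nesting level 1, exceeding path_limits[1] == 500, so reverse_path_check()
 * fails and the EPOLL_CTL_ADD that would create the offending link is
 * rejected (ep_insert() returns -EINVAL). Direct, non-nested watches are
 * counted at level 0 and are not limited by this check.
 */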
	if (!ws)
		return -ENOMEM;
	rcu_assign_pointer(epi->ws, ws);

	return 0;
}

/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	RCU_INIT_POINTER(epi->ws, NULL);

	/*
	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
	 * used internally by wakeup_source_remove, too (called by
	 * wakeup_source_unregister), so we cannot use call_rcu
	 */
	synchronize_rcu();
	wakeup_source_unregister(ws);
}
	if (tep)
		mutex_lock_nested(&tep->mtx, 1);
	/* Add the current item to the list of active epoll hook for this file */
	if (unlikely(attach_epitem(tfile, epi) < 0)) {
		if (tep)
			mutex_unlock(&tep->mtx);
		kmem_cache_free(epi_cache, epi);
		percpu_counter_dec(&ep->user->epoll_watches);
		return -ENOMEM;
	}

	if (full_check && !tep)
		list_file(tfile);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);
	if (tep)
		mutex_unlock(&tep->mtx);

	/*
	 * ep_remove_safe() calls in the later error paths can't lead to
	 * ep_free() as the ep file itself still holds an ep reference.
	 */
	ep_get(ep);

	/* now check if we've created too many backpaths */
	if (unlikely(full_check && reverse_path_check())) {
		ep_remove_safe(ep, epi);
		return -EINVAL;
	}

	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error) {
			ep_remove_safe(ep, epi);
			return error;
		}
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * to high memory pressure.
	 */
	if (unlikely(!epq.epi)) {
		ep_remove_safe(ep, epi);
		return -ENOMEM;
	}

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irq(&ep->lock);

	/* record NAPI ID of new item if present */
	ep_set_busy_poll_napi_id(epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irq(&ep->lock);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL, 0);

	return 0;
}
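/*
 * Usage note (userspace, illustrative): the EPOLLWAKEUP branch above only
 * runs for callers with CAP_BLOCK_SUSPEND; ep_take_care_of_epollwakeup()
 * silently clears the flag otherwise. A privileged caller would request it
 * like this:
 *
 *	ev.events  = EPOLLIN | EPOLLWAKEUP;
 *	ev.data.fd = timer_fd;
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, timer_fd, &ev);
 *
 * The associated wakeup_source then keeps the system from auto-suspending
 * while the event is queued and while it is being processed.
 */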
/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
		     const struct epoll_event *event)
{
	int pwake = 0;
	poll_table pt;

	lockdep_assert_irqs_enabled();

	init_poll_funcptr(&pt, NULL);

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events; /* need barrier below */
	epi->event.data = event->data; /* protected by mtx */
	if (epi->event.events & EPOLLWAKEUP) {
		if (!ep_has_wakeup_source(epi))
			ep_create_wakeup_source(epi);
	} else if (ep_has_wakeup_source(epi)) {
		ep_destroy_wakeup_source(epi);
	}

	/*
	 * The following barrier has two effects:
	 *
	 * 1) Flush epi changes above to other CPUs. This ensures
	 *    we do not miss events from ep_poll_callback if an
	 *    event occurs immediately after we call f_op->poll().
	 *    We need this because we did not take ep->lock while
	 *    changing epi above (but ep_poll_callback does take
	 *    ep->lock).
	 *
	 * 2) We also need to ensure we do not miss _past_ events
	 *    when calling f_op->poll(). This barrier also
	 *    pairs with the barrier in wq_has_sleeper (see
	 *    comments for wq_has_sleeper).
	 *
	 * This barrier will now guarantee ep_poll_callback or f_op->poll
	 * (or both) will notice the readiness of an item.
	 */
	smp_mb();

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (ep_item_poll(epi, &pt, 1)) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(epi)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL, 0);

	return 0;
}
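/*
 * Usage sketch (userspace, illustrative): EPOLL_CTL_MOD is how an
 * EPOLLONESHOT descriptor gets re-armed. After ep_send_events() clears every
 * poll(2) event from the interest set (only EP_PRIVATE_BITS remain), the fd
 * stays silent until the owner reaches ep_modify() through epoll_ctl():
 *
 *	ev.events  = EPOLLIN | EPOLLONESHOT;
 *	ev.data.fd = conn_fd;
 *	epoll_ctl(epfd, EPOLL_CTL_MOD, conn_fd, &ev);
 *
 * The smp_mb() plus the ep_item_poll() call above make sure an event that
 * fired while the descriptor was disarmed is picked up immediately.
 */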
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct epitem *epi, *tmp;
	LIST_HEAD(txlist);
	poll_table pt;
	int res = 0;

	/*
	 * Always short-circuit for fatal signals to allow threads to make a
	 * timely exit without the chance of finding more events available and
	 * fetching repeatedly.
	 */
	if (fatal_signal_pending(current))
		return -EINTR;

	init_poll_funcptr(&pt, NULL);

	mutex_lock(&ep->mtx);
	ep_start_scan(ep, &txlist);

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because we are holding "mtx".
	 */
	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
		struct wakeup_source *ws;
		__poll_t revents;

		if (res >= maxevents)
			break;

		/*
		 * Activate ep->ws before deactivating epi->ws to prevent
		 * triggering auto-suspend here (in case we reactivate epi->ws
		 * below).
		 *
		 * This could be rearranged to delay the deactivation of epi->ws
		 * instead, but then epi->ws would temporarily be out of sync
		 * with ep_is_linked().
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		/*
		 * If the event mask intersects the caller-requested one,
		 * deliver the event to userspace. Again, we are holding ep->mtx,
		 * so no operations coming from userspace can change the item.
		 */
		revents = ep_item_poll(epi, &pt, 1);
		if (!revents)
			continue;

		events = epoll_put_uevent(revents, epi->event.data, events);
		if (!events) {
			list_add(&epi->rdllink, &txlist);
			ep_pm_stay_awake(epi);
			if (!res)
				res = -EFAULT;
			break;
		}

		res++;
		if (epi->event.events & EPOLLONESHOT)
			epi->event.events &= EP_PRIVATE_BITS;
		else if (!(epi->event.events & EPOLLET)) {
			/*
			 * If this file has been added with Level
			 * Trigger mode, we need to insert back inside
			 * the ready list, so that the next call to
			 * epoll_wait() will check again the events
			 * availability. At this point, no one can insert
			 * into ep->rdllist besides us. The epoll_ctl()
			 * callers are locked out by
			 * ep_send_events() holding "mtx" and the
			 * poll callback will queue them in ep->ovflist.
			 */
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	ep_done_scan(ep, &txlist);
	mutex_unlock(&ep->mtx);

	return res;
}
/*
 * autoremove_wake_function, but remove even on failure to wake up, because we
 * know that default_wake_function/ttwu will only fail if the thread is already
 * woken, and in that case the ep_poll loop will remove the entry anyways, not
 * try to reuse it.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
				       unsigned int mode, int sync, void *key)
{
	int ret = default_wake_function(wq_entry, mode, sync, key);

	/*
	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
	 * iterations see the cause of this wakeup.
	 */
	list_del_init_careful(&wq_entry->entry);
	return ret;
}
static int ep_try_send_events(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	int res;

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	res = ep_send_events(ep, events, maxevents);
	if (res > 0)
		ep_suspend_napi_irqs(ep);

	return res;
}
/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Return: the number of ready events which have been fetched, or an
 *         error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, struct timespec64 *timeout)
{
	int res, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
		slack = select_estimate_accuracy(timeout);
		to = &expires;
		*to = timespec64_to_ktime(*timeout);
	} else if (timeout) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
	}

	/*
	 * This call is racy: We may or may not see events that are being added
	 * to the ready list under the lock (e.g., in IRQ callbacks). For cases
	 * with a non-zero timeout, this thread will check the ready list under
	 * lock and will add to the wait queue. For cases with a zero
	 * timeout, the user by definition should not care and will have to
	 * recheck again.
	 */
	eavail = ep_events_available(ep);

	while (1) {
		if (eavail) {
			res = ep_try_send_events(ep, events, maxevents);
			if (res)
				return res;
		}

		if (timed_out)
			return 0;

		eavail = ep_busy_loop(ep);
		if (eavail)
			continue;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * Internally init_wait() uses autoremove_wake_function(),
		 * thus wait entry is removed from the wait queue on each
		 * wakeup. Why is it important? In case of several waiters
		 * each new wakeup will hit the next waiter, giving it the
		 * chance to harvest new events. Otherwise wakeups can be
		 * lost. This is also good performance-wise, because on
		 * normal wakeup path no need to call __remove_wait_queue()
		 * explicitly, thus ep->lock is not taken, which halts the
		 * event delivery.
		 *
		 * In fact, we now use an even more aggressive function that
		 * unconditionally removes, because we don't reuse the wait
		 * entry between loop iterations. This lets us also avoid the
		 * performance issue if a process is killed, causing all of its
		 * threads to wake up without being removed normally.
		 */
		init_wait(&wait);
		wait.func = ep_autoremove_wake_function;

		spin_lock_irq(&ep->lock);
		/*
		 * Barrierless variant, waitqueue_active() is called under
		 * the same lock on wakeup ep_poll_callback() side, so it
		 * is safe to avoid an explicit barrier.
		 */
		__set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Do the final check under the lock. ep_start/done_scan()
		 * plays with two lists (->rdllist and ->ovflist) and there
		 * is always a race when both lists are empty for a short
		 * period of time although events are pending, so the lock is
		 * important.
		 */
		eavail = ep_events_available(ep);
		if (!eavail)
			__add_wait_queue_exclusive(&ep->wq, &wait);

		spin_unlock_irq(&ep->lock);

		if (!eavail)
			timed_out = !ep_schedule_timeout(to) ||
				!schedule_hrtimeout_range(to, slack,
							  HRTIMER_MODE_ABS);
		__set_current_state(TASK_RUNNING);

		/*
		 * We were woken up, thus go and try to harvest some events.
		 * If timed out and still on the wait queue, recheck eavail
		 * carefully under lock, below.
		 */
		eavail = 1;

		if (!list_empty_careful(&wait.entry)) {
			spin_lock_irq(&ep->lock);
			/*
			 * If the thread timed out and is not on the wait queue,
			 * it means that the thread was woken up after its
			 * timeout expired before it could reacquire the lock.
			 * Thus, when wait.entry is empty, it needs to harvest
			 * events.
			 */
			if (timed_out)
				eavail = list_empty(&wait.entry);
			__remove_wait_queue(&ep->wq, &wait);
			spin_unlock_irq(&ep->lock);
		}
	}
}
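/*
 * Usage sketch (userspace, illustrative) of the timeout semantics handled
 * above: a timeout of 0 translates into timed_out = 1 and a single pass over
 * the ready list, -1 (a NULL timespec internally) blocks until an event or a
 * signal, and a positive value arms the hrtimer path:
 *
 *	n = epoll_wait(epfd, evs, 64, 0);	poll, never block
 *	n = epoll_wait(epfd, evs, 64, -1);	block indefinitely
 *	n = epoll_wait(epfd, evs, 64, 250);	block for at most 250 ms
 *
 * A return value of 0 means the timeout expired; -1 with errno == EINTR
 * means a signal arrived first.
 */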
/**
 * ep_loop_check_proc - verify that adding an epoll file @ep inside another
 *                      epoll file does not create closed loops, and
 *                      determine the depth of the subtree starting at @ep
 *
 * @ep: the &struct eventpoll to be currently checked.
 * @depth: Current depth of the path being checked.
 *
 * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
 */
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
	int result = 0;
	struct rb_node *rbp;
	struct epitem *epi;

	if (ep->gen == loop_check_gen)
		return ep->loop_check_depth;

	mutex_lock_nested(&ep->mtx, depth + 1);
	ep->gen = loop_check_gen;
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			struct eventpoll *ep_tovisit;

			ep_tovisit = epi->ffd.file->private_data;
			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
				result = INT_MAX;
			else
				result = max(result,
					     ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
			if (result > EP_MAX_NESTS)
				break;
		} else {
			/*
			 * If we've reached a file that is not associated with
			 * an ep, then we need to check if the newly added
			 * links are going to add too many wakeup paths. We do
			 * this by adding it to the tfile_check_list, if it's
			 * not already there, and calling reverse_path_check()
			 * during ep_insert().
			 */
			list_file(epi->ffd.file);
		}
	}
	ep->loop_check_depth = result;
	mutex_unlock(&ep->mtx);

	return result;
}
/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
{
	int result = 0;
	struct epitem *epi;
/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll we are inserting into.
 * @to: Pointer to the epoll to be inserted.
 *
 * Return: %zero if adding the epoll @to inside the epoll @from
 * does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
	int depth, upwards_depth;

	inserting_into = ep;
	/*
	 * Check how deep down we can get from @to, and whether it is possible
	 * to loop up to @ep.
	 */
	depth = ep_loop_check_proc(to, 0);
	if (depth > EP_MAX_NESTS)
		return -1;
	/* Check how far up we can go from @ep. */
	rcu_read_lock();
	upwards_depth = ep_get_upwards_depth_proc(ep, 0);
	rcu_read_unlock();
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
				   bool nonblock)
{
	if (!nonblock) {
		mutex_lock_nested(mutex, depth);
		return 0;
	}
	if (mutex_trylock(mutex))
		return 0;
	return -EAGAIN;
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	int error;
	int full_check = 0;
	struct eventpoll *ep;
	struct epitem *epi;
	struct eventpoll *tep = NULL;

	CLASS(fd, f)(epfd);
	if (fd_empty(f))
		return -EBADF;

	/* Get the "struct file *" for the target file */
	CLASS(fd, tf)(fd);
	if (fd_empty(tf))
		return -EBADF;

	/* The target file descriptor must support poll */
	if (!file_can_poll(fd_file(tf)))
		return -EPERM;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = fd_file(f)->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here, than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD
	 * when the epoll file descriptor is attaching directly to a wakeup
	 * source, unless the epoll file descriptor is nested. The purpose of
	 * taking the 'epnested_mutex' on add is to prevent complex topologies
	 * such as loops and deep wakeup paths from forming in parallel through
	 * multiple EPOLL_CTL_ADD operations.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
		    is_file_epoll(fd_file(tf))) {
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			loop_check_gen++;
			full_check = 1;
			if (is_file_epoll(fd_file(tf))) {
				tep = fd_file(tf)->private_data;
				error = -ELOOP;
				if (ep_loop_check(ep, tep) != 0)
					goto error_tgt_fput;
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, fd_file(tf), fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi) {
			/*
			 * The eventpoll itself is still alive: the refcount
			 * can't go to zero here.
			 */
			ep_remove_safe(ep, epi);
			error = 0;
		} else {
			error = -ENOENT;
		}
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epnested_mutex);
	}

	return error;
}
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event epds;

	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		return -EFAULT;

	return do_epoll_ctl(epfd, op, fd, &epds, false);
}
static int ep_check_params(struct file *file, struct epoll_event __user *evs,
			   int maxevents)
{
	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	if (!is_file_epoll(file))
		return -EINVAL;

	return 0;
}