source: src/linux/universal/linux-3.2/fs/eventpoll.c @ 18171

Last change on this file since 18171 was 18171, checked in by BrainSlayer, 16 months ago

this kernel will be maintained for all targets, so target-specific kernel trees will not be necessary anymore in the future

File size: 44.5 KB
Line 
1/*
2 *  fs/eventpoll.c (Efficient event retrieval implementation)
3 *  Copyright (C) 2001,...,2009  Davide Libenzi
4 *
5 *  This program is free software; you can redistribute it and/or modify
6 *  it under the terms of the GNU General Public License as published by
7 *  the Free Software Foundation; either version 2 of the License, or
8 *  (at your option) any later version.
9 *
10 *  Davide Libenzi <davidel@xmailserver.org>
11 *
12 */
13
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/poll.h>
24#include <linux/string.h>
25#include <linux/list.h>
26#include <linux/hash.h>
27#include <linux/spinlock.h>
28#include <linux/syscalls.h>
29#include <linux/rbtree.h>
30#include <linux/wait.h>
31#include <linux/eventpoll.h>
32#include <linux/mount.h>
33#include <linux/bitops.h>
34#include <linux/mutex.h>
35#include <linux/anon_inodes.h>
36#include <asm/uaccess.h>
37#include <asm/system.h>
38#include <asm/io.h>
39#include <asm/mman.h>
40#include <linux/atomic.h>
41
42/*
43 * LOCKING:
44 * There are three levels of locking required by epoll:
45 *
46 * 1) epmutex (mutex)
47 * 2) ep->mtx (mutex)
48 * 3) ep->lock (spinlock)
49 *
50 * The acquire order is the one listed above, from 1 to 3.
51 * We need a spinlock (ep->lock) because we manipulate objects
52 * from inside the poll callback, that might be triggered from
53 * a wake_up() that in turn might be called from IRQ context.
54 * So we can't sleep inside the poll callback and hence we need
55 * a spinlock. During the event transfer loop (from kernel to
56 * user space) we could end up sleeping due to a copy_to_user(), so
57 * we need a lock that will allow us to sleep. This lock is a
58 * mutex (ep->mtx). It is acquired during the event transfer loop,
59 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
60 * Then we also need a global mutex to serialize eventpoll_release_file()
61 * and ep_free().
62 * This mutex is acquired by ep_free() during the epoll file
63 * cleanup path and it is also acquired by eventpoll_release_file()
64 * if a file has been pushed inside an epoll set and it is then
65 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
66 * It is also acquired when inserting an epoll fd onto another epoll
67 * fd. We do this so that we walk the epoll tree and ensure that this
68 * insertion does not create a cycle of epoll file descriptors, which
69 * could lead to deadlock. We need a global mutex to prevent two
70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is
72 * going to.
73 * It is necessary to acquire multiple "ep->mtx"es at once in the
74 * case when one epoll fd is added to another. In this case, we
75 * always acquire the locks in the order of nesting (i.e. after
76 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
77 * before e2->mtx). Since we disallow cycles of epoll file
78 * descriptors, this ensures that the mutexes are well-ordered. In
79 * order to communicate this nesting to lockdep, when walking a tree
80 * of epoll file descriptors, we use the current recursion depth as
81 * the lockdep subkey.
82 * It is possible to drop the "ep->mtx" and to use the global
83 * mutex "epmutex" (together with "ep->lock") to have it working,
84 * but having "ep->mtx" will make the interface more scalable.
85 * Events that require holding "epmutex" are very rare, while for
86 * normal operations the epoll private "ep->mtx" will guarantee
87 * a better scalability.
88 */
89
/* Event mask bits that belong to epoll itself rather than to poll(2) */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Upper bound on how deeply epoll sets may be nested */
#define EP_MAX_NESTS 4

/* INT_MAX divided by the size of one "struct epoll_event" */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/*
 * Sentinel pointer value; "struct eventpoll"->ovflist and
 * "struct epitem"->next are compared against it in this file.
 */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Combined size of one "struct epitem" and one "struct eppoll_entry" */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
101
/*
 * A (file pointer, fd number) pair. It is the key under which items are
 * stored in, and looked up from, the eventpoll RB tree.
 */
struct epoll_filefd {
        struct file *file;
        int fd;
};
106
107/*
108 * Structure used to track possible nested calls, for too deep recursions
109 * and loop cycles.
110 */
111struct nested_call_node {
112        struct list_head llink;
113        void *cookie;
114        void *ctx;
115};
116
117/*
118 * This structure is used as collector for nested calls, to check for
119 * maximum recursion dept and loop cycles.
120 */
121struct nested_calls {
122        struct list_head tasks_call_list;
123        spinlock_t lock;
124};
125
126/*
127 * Each file descriptor added to the eventpoll interface will
128 * have an entry of this type linked to the "rbr" RB tree.
129 */
130struct epitem {
131        /* RB tree node used to link this structure to the eventpoll RB tree */
132        struct rb_node rbn;
133
134        /* List header used to link this structure to the eventpoll ready list */
135        struct list_head rdllink;
136
137        /*
138         * Works together "struct eventpoll"->ovflist in keeping the
139         * single linked chain of items.
140         */
141        struct epitem *next;
142
143        /* The file descriptor information this item refers to */
144        struct epoll_filefd ffd;
145
146        /* Number of active wait queue attached to poll operations */
147        int nwait;
148
149        /* List containing poll wait queues */
150        struct list_head pwqlist;
151
152        /* The "container" of this item */
153        struct eventpoll *ep;
154
155        /* List header used to link this item to the "struct file" items list */
156        struct list_head fllink;
157
158        /* The structure that describe the interested events and the source fd */
159        struct epoll_event event;
160};
161
162/*
163 * This structure is stored inside the "private_data" member of the file
164 * structure and represents the main data structure for the eventpoll
165 * interface.
166 */
167struct eventpoll {
168        /* Protect the access to this structure */
169        spinlock_t lock;
170
171        /*
172         * This mutex is used to ensure that files are not removed
173         * while epoll is using them. This is held during the event
174         * collection loop, the file cleanup path, the epoll file exit
175         * code and the ctl operations.
176         */
177        struct mutex mtx;
178
179        /* Wait queue used by sys_epoll_wait() */
180        wait_queue_head_t wq;
181
182        /* Wait queue used by file->poll() */
183        wait_queue_head_t poll_wait;
184
185        /* List of ready file descriptors */
186        struct list_head rdllist;
187
188        /* RB tree root used to store monitored fd structs */
189        struct rb_root rbr;
190
191        /*
192         * This is a single linked list that chains all the "struct epitem" that
193         * happened while transferring ready events to userspace w/out
194         * holding ->lock.
195         */
196        struct epitem *ovflist;
197
198        /* The user that created the eventpoll descriptor */
199        struct user_struct *user;
200};
201
202/* Wait structure used by the poll hooks */
203struct eppoll_entry {
204        /* List header used to link this structure to the "struct epitem" */
205        struct list_head llink;
206
207        /* The "base" pointer is set to the container "struct epitem" */
208        struct epitem *base;
209
210        /*
211         * Wait queue item that will be linked to the target file wait
212         * queue head.
213         */
214        wait_queue_t wait;
215
216        /* The wait queue head that linked the "wait" wait queue item */
217        wait_queue_head_t *whead;
218};
219
220/* Wrapper struct used by poll queueing */
221struct ep_pqueue {
222        poll_table pt;
223        struct epitem *epi;
224};
225
226/* Used by the ep_send_events() function as callback private data */
227struct ep_send_events_data {
228        int maxevents;
229        struct epoll_event __user *events;
230};
231
232/*
233 * Configuration options available inside /proc/sys/fs/epoll/
234 */
235/* Maximum number of epoll watched descriptors, per user */
236static long max_user_watches __read_mostly;
237
238/*
239 * This mutex is used to serialize ep_free() and eventpoll_release_file().
240 */
241static DEFINE_MUTEX(epmutex);
242
243/* Used to check for epoll file descriptor inclusion loops */
244static struct nested_calls poll_loop_ncalls;
245
246/* Used for safe wake up implementation */
247static struct nested_calls poll_safewake_ncalls;
248
249/* Used to call file's f_op->poll() under the nested calls boundaries */
250static struct nested_calls poll_readywalk_ncalls;
251
252/* Slab cache used to allocate "struct epitem" */
253static struct kmem_cache *epi_cache __read_mostly;
254
255/* Slab cache used to allocate "struct eppoll_entry" */
256static struct kmem_cache *pwq_cache __read_mostly;
257
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Lower and upper bounds handed to the sysctl handler below */
static long zero;
static long long_max = LONG_MAX;

/*
 * Sysctl table exposing "max_user_watches" as a read/write (0644)
 * unsigned long vector bounded by [zero, long_max].
 */
ctl_table epoll_table[] = {
        {
                .procname       = "max_user_watches",
                .data           = &max_user_watches,
                .maxlen         = sizeof(max_user_watches),
                .mode           = 0644,
                .proc_handler   = proc_doulongvec_minmax,
                .extra1         = &zero,
                .extra2         = &long_max,
        },
        { }     /* terminating empty entry */
};
#endif /* CONFIG_SYSCTL */
278
279
280/* Setup the structure that is used as key for the RB tree */
281static inline void ep_set_ffd(struct epoll_filefd *ffd,
282                              struct file *file, int fd)
283{
284        ffd->file = file;
285        ffd->fd = fd;
286}
287
288/* Compare RB tree keys */
289static inline int ep_cmp_ffd(struct epoll_filefd *p1,
290                             struct epoll_filefd *p2)
291{
292        return (p1->file > p2->file ? +1:
293                (p1->file < p2->file ? -1 : p1->fd - p2->fd));
294}
295
/* Returns non-zero when the list node @p is currently linked in a list */
static inline int ep_is_linked(struct list_head *p)
{
        if (list_empty(p))
                return 0;

        return 1;
}
301
302/* Get the "struct epitem" from a wait queue pointer */
303static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
304{
305        return container_of(p, struct eppoll_entry, wait)->base;
306}
307
308/* Get the "struct epitem" from an epoll queue wrapper */
309static inline struct epitem *ep_item_from_epqueue(poll_table *p)
310{
311        return container_of(p, struct ep_pqueue, pt)->epi;
312}
313
/*
 * Tells whether the epoll_ctl(2) operation @op needs an event copy from
 * userspace: every operation does, except EPOLL_CTL_DEL.
 */
static inline int ep_op_has_event(int op)
{
        if (op == EPOLL_CTL_DEL)
                return 0;

        return 1;
}
319
320/* Initialize the poll safe wake up structure */
321static void ep_nested_calls_init(struct nested_calls *ncalls)
322{
323        INIT_LIST_HEAD(&ncalls->tasks_call_list);
324        spin_lock_init(&ncalls->lock);
325}
326
327/**
328 * ep_events_available - Checks if ready events might be available.
329 *
330 * @ep: Pointer to the eventpoll context.
331 *
332 * Returns: Returns a value different than zero if ready events are available,
333 *          or zero otherwise.
334 */
335static inline int ep_events_available(struct eventpoll *ep)
336{
337        return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
338}
339
340/**
341 * ep_call_nested - Perform a bound (possibly) nested call, by checking
342 *                  that the recursion limit is not exceeded, and that
343 *                  the same nested call (by the meaning of same cookie) is
344 *                  no re-entered.
345 *
346 * @ncalls: Pointer to the nested_calls structure to be used for this call.
347 * @max_nests: Maximum number of allowed nesting calls.
348 * @nproc: Nested call core function pointer.
349 * @priv: Opaque data to be passed to the @nproc callback.
350 * @cookie: Cookie to be used to identify this nested call.
351 * @ctx: This instance context.
352 *
353 * Returns: Returns the code returned by the @nproc callback, or -1 if
354 *          the maximum recursion limit has been exceeded.
355 */
356static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
357                          int (*nproc)(void *, void *, int), void *priv,
358                          void *cookie, void *ctx)
359{
360        int error, call_nests = 0;
361        unsigned long flags;
362        struct list_head *lsthead = &ncalls->tasks_call_list;
363        struct nested_call_node *tncur;
364        struct nested_call_node tnode;
365
366        spin_lock_irqsave(&ncalls->lock, flags);
367
368        /*
369         * Try to see if the current task is already inside this wakeup call.
370         * We use a list here, since the population inside this set is always
371         * very much limited.
372         */
373        list_for_each_entry(tncur, lsthead, llink) {
374                if (tncur->ctx == ctx &&
375                    (tncur->cookie == cookie || ++call_nests > max_nests)) {
376                        /*
377                         * Ops ... loop detected or maximum nest level reached.
378                         * We abort this wake by breaking the cycle itself.
379                         */
380                        error = -1;
381                        goto out_unlock;
382                }
383        }
384
385        /* Add the current task and cookie to the list */
386        tnode.ctx = ctx;
387        tnode.cookie = cookie;
388        list_add(&tnode.llink, lsthead);
389
390        spin_unlock_irqrestore(&ncalls->lock, flags);
391
392        /* Call the nested function */
393        error = (*nproc)(priv, cookie, call_nests);
394
395        /* Remove the current task from the list */
396        spin_lock_irqsave(&ncalls->lock, flags);
397        list_del(&tnode.llink);
398out_unlock:
399        spin_unlock_irqrestore(&ncalls->lock, flags);
400
401        return error;
402}
403
404#ifdef CONFIG_DEBUG_LOCK_ALLOC
405static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
406                                     unsigned long events, int subclass)
407{
408        unsigned long flags;
409
410        spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
411        wake_up_locked_poll(wqueue, events);
412        spin_unlock_irqrestore(&wqueue->lock, flags);
413}
414#else
415static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
416                                     unsigned long events, int subclass)
417{
418        wake_up_poll(wqueue, events);
419}
420#endif
421
422static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
423{
424        ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
425                          1 + call_nests);
426        return 0;
427}
428
429/*
430 * Perform a safe wake up of the poll wait list. The problem is that
431 * with the new callback'd wake up system, it is possible that the
432 * poll callback is reentered from inside the call to wake_up() done
433 * on the poll wait queue head. The rule is that we cannot reenter the
434 * wake up code from the same task more than EP_MAX_NESTS times,
435 * and we cannot reenter the same wait queue head at all. This will
436 * enable to have a hierarchy of epoll file descriptor of no more than
437 * EP_MAX_NESTS deep.
438 */
439static void ep_poll_safewake(wait_queue_head_t *wq)
440{
441        int this_cpu = get_cpu();
442
443        ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
444                       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
445
446        put_cpu();
447}
448
449/*
450 * This function unregisters poll callbacks from the associated file
451 * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
452 * ep_free).
453 */
454static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
455{
456        struct list_head *lsthead = &epi->pwqlist;
457        struct eppoll_entry *pwq;
458
459        while (!list_empty(lsthead)) {
460                pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
461
462                list_del(&pwq->llink);
463                remove_wait_queue(pwq->whead, &pwq->wait);
464                kmem_cache_free(pwq_cache, pwq);
465        }
466}
467
468/**
469 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
470 *                      the scan code, to call f_op->poll(). Also allows for
471 *                      O(NumReady) performance.
472 *
473 * @ep: Pointer to the epoll private data structure.
474 * @sproc: Pointer to the scan callback.
475 * @priv: Private opaque data passed to the @sproc callback.
476 * @depth: The current depth of recursive f_op->poll calls.
477 *
478 * Returns: The same integer error code returned by the @sproc callback.
479 */
480static int ep_scan_ready_list(struct eventpoll *ep,
481                              int (*sproc)(struct eventpoll *,
482                                           struct list_head *, void *),
483                              void *priv,
484                              int depth)
485{
486        int error, pwake = 0;
487        unsigned long flags;
488        struct epitem *epi, *nepi;
489        LIST_HEAD(txlist);
490
491        /*
492         * We need to lock this because we could be hit by
493         * eventpoll_release_file() and epoll_ctl().
494         */
495        mutex_lock_nested(&ep->mtx, depth);
496
497        /*
498         * Steal the ready list, and re-init the original one to the
499         * empty list. Also, set ep->ovflist to NULL so that events
500         * happening while looping w/out locks, are not lost. We cannot
501         * have the poll callback to queue directly on ep->rdllist,
502         * because we want the "sproc" callback to be able to do it
503         * in a lockless way.
504         */
505        spin_lock_irqsave(&ep->lock, flags);
506        list_splice_init(&ep->rdllist, &txlist);
507        ep->ovflist = NULL;
508        spin_unlock_irqrestore(&ep->lock, flags);
509
510        /*
511         * Now call the callback function.
512         */
513        error = (*sproc)(ep, &txlist, priv);
514
515        spin_lock_irqsave(&ep->lock, flags);
516        /*
517         * During the time we spent inside the "sproc" callback, some
518         * other events might have been queued by the poll callback.
519         * We re-insert them inside the main ready-list here.
520         */
521        for (nepi = ep->ovflist; (epi = nepi) != NULL;
522             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
523                /*
524                 * We need to check if the item is already in the list.
525                 * During the "sproc" callback execution time, items are
526                 * queued into ->ovflist but the "txlist" might already
527                 * contain them, and the list_splice() below takes care of them.
528                 */
529                if (!ep_is_linked(&epi->rdllink))
530                        list_add_tail(&epi->rdllink, &ep->rdllist);
531        }
532        /*
533         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
534         * releasing the lock, events will be queued in the normal way inside
535         * ep->rdllist.
536         */
537        ep->ovflist = EP_UNACTIVE_PTR;
538
539        /*
540         * Quickly re-inject items left on "txlist".
541         */
542        list_splice(&txlist, &ep->rdllist);
543
544        if (!list_empty(&ep->rdllist)) {
545                /*
546                 * Wake up (if active) both the eventpoll wait list and
547                 * the ->poll() wait list (delayed after we release the lock).
548                 */
549                if (waitqueue_active(&ep->wq))
550                        wake_up_locked(&ep->wq);
551                if (waitqueue_active(&ep->poll_wait))
552                        pwake++;
553        }
554        spin_unlock_irqrestore(&ep->lock, flags);
555
556        mutex_unlock(&ep->mtx);
557
558        /* We have to call this outside the lock */
559        if (pwake)
560                ep_poll_safewake(&ep->poll_wait);
561
562        return error;
563}
564
565/*
566 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
567 * all the associated resources. Must be called with "mtx" held.
568 */
569static int ep_remove(struct eventpoll *ep, struct epitem *epi)
570{
571        unsigned long flags;
572        struct file *file = epi->ffd.file;
573
574        /*
575         * Removes poll wait queue hooks. We _have_ to do this without holding
576         * the "ep->lock" otherwise a deadlock might occur. This because of the
577         * sequence of the lock acquisition. Here we do "ep->lock" then the wait
578         * queue head lock when unregistering the wait queue. The wakeup callback
579         * will run by holding the wait queue head lock and will call our callback
580         * that will try to get "ep->lock".
581         */
582        ep_unregister_pollwait(ep, epi);
583
584        /* Remove the current item from the list of epoll hooks */
585        spin_lock(&file->f_lock);
586        if (ep_is_linked(&epi->fllink))
587                list_del_init(&epi->fllink);
588        spin_unlock(&file->f_lock);
589
590        rb_erase(&epi->rbn, &ep->rbr);
591
592        spin_lock_irqsave(&ep->lock, flags);
593        if (ep_is_linked(&epi->rdllink))
594                list_del_init(&epi->rdllink);
595        spin_unlock_irqrestore(&ep->lock, flags);
596
597        /* At this point it is safe to free the eventpoll item */
598        kmem_cache_free(epi_cache, epi);
599
600        atomic_long_dec(&ep->user->epoll_watches);
601
602        return 0;
603}
604
605static void ep_free(struct eventpoll *ep)
606{
607        struct rb_node *rbp;
608        struct epitem *epi;
609
610        /* We need to release all tasks waiting for these file */
611        if (waitqueue_active(&ep->poll_wait))
612                ep_poll_safewake(&ep->poll_wait);
613
614        /*
615         * We need to lock this because we could be hit by
616         * eventpoll_release_file() while we're freeing the "struct eventpoll".
617         * We do not need to hold "ep->mtx" here because the epoll file
618         * is on the way to be removed and no one has references to it
619         * anymore. The only hit might come from eventpoll_release_file() but
620         * holding "epmutex" is sufficient here.
621         */
622        mutex_lock(&epmutex);
623
624        /*
625         * Walks through the whole tree by unregistering poll callbacks.
626         */
627        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
628                epi = rb_entry(rbp, struct epitem, rbn);
629
630                ep_unregister_pollwait(ep, epi);
631        }
632
633        /*
634         * Walks through the whole tree by freeing each "struct epitem". At this
635         * point we are sure no poll callbacks will be lingering around, and also by
636         * holding "epmutex" we can be sure that no file cleanup code will hit
637         * us during this operation. So we can avoid the lock on "ep->lock".
638         */
639        while ((rbp = rb_first(&ep->rbr)) != NULL) {
640                epi = rb_entry(rbp, struct epitem, rbn);
641                ep_remove(ep, epi);
642        }
643
644        mutex_unlock(&epmutex);
645        mutex_destroy(&ep->mtx);
646        free_uid(ep->user);
647        kfree(ep);
648}
649
650static int ep_eventpoll_release(struct inode *inode, struct file *file)
651{
652        struct eventpoll *ep = file->private_data;
653
654        if (ep)
655                ep_free(ep);
656
657        return 0;
658}
659
660static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
661                               void *priv)
662{
663        struct epitem *epi, *tmp;
664
665        list_for_each_entry_safe(epi, tmp, head, rdllink) {
666                if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
667                    epi->event.events)
668                        return POLLIN | POLLRDNORM;
669                else {
670                        /*
671                         * Item has been dropped into the ready list by the poll
672                         * callback, but it's not actually ready, as far as
673                         * caller requested events goes. We can remove it here.
674                         */
675                        list_del_init(&epi->rdllink);
676                }
677        }
678
679        return 0;
680}
681
682static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
683{
684        return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
685}
686
687static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
688{
689        int pollflags;
690        struct eventpoll *ep = file->private_data;
691
692        /* Insert inside our poll wait queue */
693        poll_wait(file, &ep->poll_wait, wait);
694
695        /*
696         * Proceed to find out if wanted events are really available inside
697         * the ready list. This need to be done under ep_call_nested()
698         * supervision, since the call to f_op->poll() done on listed files
699         * could re-enter here.
700         */
701        pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
702                                   ep_poll_readyevents_proc, ep, ep, current);
703
704        return pollflags != -1 ? pollflags : 0;
705}
706
707/* File callbacks that implement the eventpoll file behaviour */
708static const struct file_operations eventpoll_fops = {
709        .release        = ep_eventpoll_release,
710        .poll           = ep_eventpoll_poll,
711        .llseek         = noop_llseek,
712};
713
714/* Fast test to see if the file is an eventpoll file */
715static inline int is_file_epoll(struct file *f)
716{
717        return f->f_op == &eventpoll_fops;
718}
719
720/*
721 * This is called from eventpoll_release() to unlink files from the eventpoll
722 * interface. We need to have this facility to cleanup correctly files that are
723 * closed without being removed from the eventpoll interface.
724 */
725void eventpoll_release_file(struct file *file)
726{
727        struct list_head *lsthead = &file->f_ep_links;
728        struct eventpoll *ep;
729        struct epitem *epi;
730
731        /*
732         * We don't want to get "file->f_lock" because it is not
733         * necessary. It is not necessary because we're in the "struct file"
734         * cleanup path, and this means that no one is using this file anymore.
735         * So, for example, epoll_ctl() cannot hit here since if we reach this
736         * point, the file counter already went to zero and fget() would fail.
737         * The only hit might come from ep_free() but by holding the mutex
738         * will correctly serialize the operation. We do need to acquire
739         * "ep->mtx" after "epmutex" because ep_remove() requires it when called
740         * from anywhere but ep_free().
741         *
742         * Besides, ep_remove() acquires the lock, so we can't hold it here.
743         */
744        mutex_lock(&epmutex);
745
746        while (!list_empty(lsthead)) {
747                epi = list_first_entry(lsthead, struct epitem, fllink);
748
749                ep = epi->ep;
750                list_del_init(&epi->fllink);
751                mutex_lock_nested(&ep->mtx, 0);
752                ep_remove(ep, epi);
753                mutex_unlock(&ep->mtx);
754        }
755
756        mutex_unlock(&epmutex);
757}
758
759static int ep_alloc(struct eventpoll **pep)
760{
761        int error;
762        struct user_struct *user;
763        struct eventpoll *ep;
764
765        user = get_current_user();
766        error = -ENOMEM;
767        ep = kzalloc(sizeof(*ep), GFP_KERNEL);
768        if (unlikely(!ep))
769                goto free_uid;
770
771        spin_lock_init(&ep->lock);
772        mutex_init(&ep->mtx);
773        init_waitqueue_head(&ep->wq);
774        init_waitqueue_head(&ep->poll_wait);
775        INIT_LIST_HEAD(&ep->rdllist);
776        ep->rbr = RB_ROOT;
777        ep->ovflist = EP_UNACTIVE_PTR;
778        ep->user = user;
779
780        *pep = ep;
781
782        return 0;
783
784free_uid:
785        free_uid(user);
786        return error;
787}
788
789/*
790 * Search the file inside the eventpoll tree. The RB tree operations
791 * are protected by the "mtx" mutex, and ep_find() must be called with
792 * "mtx" held.
793 */
794static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
795{
796        int kcmp;
797        struct rb_node *rbp;
798        struct epitem *epi, *epir = NULL;
799        struct epoll_filefd ffd;
800
801        ep_set_ffd(&ffd, file, fd);
802        for (rbp = ep->rbr.rb_node; rbp; ) {
803                epi = rb_entry(rbp, struct epitem, rbn);
804                kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
805                if (kcmp > 0)
806                        rbp = rbp->rb_right;
807                else if (kcmp < 0)
808                        rbp = rbp->rb_left;
809                else {
810                        epir = epi;
811                        break;
812                }
813        }
814
815        return epir;
816}
817
818/*
819 * This is the callback that is passed to the wait queue wakeup
820 * mechanism. It is called by the stored file descriptors when they
821 * have events to report.
822 */
823static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
824{
825        int pwake = 0;
826        unsigned long flags;
827        struct epitem *epi = ep_item_from_wait(wait);
828        struct eventpoll *ep = epi->ep;
829
830        spin_lock_irqsave(&ep->lock, flags);
831
832        /*
833         * If the event mask does not contain any poll(2) event, we consider the
834         * descriptor to be disabled. This condition is likely the effect of the
835         * EPOLLONESHOT bit that disables the descriptor when an event is received,
836         * until the next EPOLL_CTL_MOD will be issued.
837         */
838        if (!(epi->event.events & ~EP_PRIVATE_BITS))
839                goto out_unlock;
840
841        /*
842         * Check the events coming with the callback. At this stage, not
843         * every device reports the events in the "key" parameter of the
844         * callback. We need to be able to handle both cases here, hence the
845         * test for "key" != NULL before the event match test.
846         */
847        if (key && !((unsigned long) key & epi->event.events))
848                goto out_unlock;
849
850        /*
851         * If we are transferring events to userspace, we can hold no locks
852         * (because we're accessing user memory, and because of linux f_op->poll()
853         * semantics). All the events that happen during that period of time are
854         * chained in ep->ovflist and requeued later on.
855         */
856        if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
857                if (epi->next == EP_UNACTIVE_PTR) {
858                        epi->next = ep->ovflist;
859                        ep->ovflist = epi;
860                }
861                goto out_unlock;
862        }
863
864        /* If this file is already in the ready list we exit soon */
865        if (!ep_is_linked(&epi->rdllink))
866                list_add_tail(&epi->rdllink, &ep->rdllist);
867
868        /*
869         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
870         * wait list.
871         */
872        if (waitqueue_active(&ep->wq))
873                wake_up_locked(&ep->wq);
874        if (waitqueue_active(&ep->poll_wait))
875                pwake++;
876
877out_unlock:
878        spin_unlock_irqrestore(&ep->lock, flags);
879
880        /* We have to call this outside the lock */
881        if (pwake)
882                ep_poll_safewake(&ep->poll_wait);
883
884        return 1;
885}
886
887/*
888 * This is the callback that is used to add our wait queue to the
889 * target file wakeup lists.
890 */
891static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
892                                 poll_table *pt)
893{
894        struct epitem *epi = ep_item_from_epqueue(pt);
895        struct eppoll_entry *pwq;
896
897        if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
898                init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
899                pwq->whead = whead;
900                pwq->base = epi;
901                add_wait_queue(whead, &pwq->wait);
902                list_add_tail(&pwq->llink, &epi->pwqlist);
903                epi->nwait++;
904        } else {
905                /* We have to signal that an error occurred */
906                epi->nwait = -1;
907        }
908}
909
910static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
911{
912        int kcmp;
913        struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
914        struct epitem *epic;
915
916        while (*p) {
917                parent = *p;
918                epic = rb_entry(parent, struct epitem, rbn);
919                kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
920                if (kcmp > 0)
921                        p = &parent->rb_right;
922                else
923                        p = &parent->rb_left;
924        }
925        rb_link_node(&epi->rbn, parent, p);
926        rb_insert_color(&epi->rbn, &ep->rbr);
927}
928
929/*
930 * Must be called with "mtx" held.
931 */
932static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
933                     struct file *tfile, int fd)
934{
935        int error, revents, pwake = 0;
936        unsigned long flags;
937        long user_watches;
938        struct epitem *epi;
939        struct ep_pqueue epq;
940
941        user_watches = atomic_long_read(&ep->user->epoll_watches);
942        if (unlikely(user_watches >= max_user_watches))
943                return -ENOSPC;
944        if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
945                return -ENOMEM;
946
947        /* Item initialization follow here ... */
948        INIT_LIST_HEAD(&epi->rdllink);
949        INIT_LIST_HEAD(&epi->fllink);
950        INIT_LIST_HEAD(&epi->pwqlist);
951        epi->ep = ep;
952        ep_set_ffd(&epi->ffd, tfile, fd);
953        epi->event = *event;
954        epi->nwait = 0;
955        epi->next = EP_UNACTIVE_PTR;
956
957        /* Initialize the poll table using the queue callback */
958        epq.epi = epi;
959        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
960
961        /*
962         * Attach the item to the poll hooks and get current event bits.
963         * We can safely use the file* here because its usage count has
964         * been increased by the caller of this function. Note that after
965         * this operation completes, the poll callback can start hitting
966         * the new item.
967         */
968        revents = tfile->f_op->poll(tfile, &epq.pt);
969
970        /*
971         * We have to check if something went wrong during the poll wait queue
972         * install process. Namely an allocation for a wait queue failed due
973         * high memory pressure.
974         */
975        error = -ENOMEM;
976        if (epi->nwait < 0)
977                goto error_unregister;
978
979        /* Add the current item to the list of active epoll hook for this file */
980        spin_lock(&tfile->f_lock);
981        list_add_tail(&epi->fllink, &tfile->f_ep_links);
982        spin_unlock(&tfile->f_lock);
983
984        /*
985         * Add the current item to the RB tree. All RB tree operations are
986         * protected by "mtx", and ep_insert() is called with "mtx" held.
987         */
988        ep_rbtree_insert(ep, epi);
989
990        /* We have to drop the new item inside our item list to keep track of it */
991        spin_lock_irqsave(&ep->lock, flags);
992
993        /* If the file is already "ready" we drop it inside the ready list */
994        if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
995                list_add_tail(&epi->rdllink, &ep->rdllist);
996
997                /* Notify waiting tasks that events are available */
998                if (waitqueue_active(&ep->wq))
999                        wake_up_locked(&ep->wq);
1000                if (waitqueue_active(&ep->poll_wait))
1001                        pwake++;
1002        }
1003
1004        spin_unlock_irqrestore(&ep->lock, flags);
1005
1006        atomic_long_inc(&ep->user->epoll_watches);
1007
1008        /* We have to call this outside the lock */
1009        if (pwake)
1010                ep_poll_safewake(&ep->poll_wait);
1011
1012        return 0;
1013
1014error_unregister:
1015        ep_unregister_pollwait(ep, epi);
1016
1017        /*
1018         * We need to do this because an event could have been arrived on some
1019         * allocated wait queue. Note that we don't care about the ep->ovflist
1020         * list, since that is used/cleaned only inside a section bound by "mtx".
1021         * And ep_insert() is called with "mtx" held.
1022         */
1023        spin_lock_irqsave(&ep->lock, flags);
1024        if (ep_is_linked(&epi->rdllink))
1025                list_del_init(&epi->rdllink);
1026        spin_unlock_irqrestore(&ep->lock, flags);
1027
1028        kmem_cache_free(epi_cache, epi);
1029
1030        return error;
1031}
1032
1033/*
1034 * Modify the interest event mask by dropping an event if the new mask
1035 * has a match in the current file status. Must be called with "mtx" held.
1036 */
1037static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
1038{
1039        int pwake = 0;
1040        unsigned int revents;
1041
1042        /*
1043         * Set the new event interest mask before calling f_op->poll();
1044         * otherwise we might miss an event that happens between the
1045         * f_op->poll() call and the new event set registering.
1046         */
1047        epi->event.events = event->events;
1048        epi->event.data = event->data; /* protected by mtx */
1049
1050        /*
1051         * Get current event bits. We can safely use the file* here because
1052         * its usage count has been increased by the caller of this function.
1053         */
1054        revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
1055
1056        /*
1057         * If the item is "hot" and it is not registered inside the ready
1058         * list, push it inside.
1059         */
1060        if (revents & event->events) {
1061                spin_lock_irq(&ep->lock);
1062                if (!ep_is_linked(&epi->rdllink)) {
1063                        list_add_tail(&epi->rdllink, &ep->rdllist);
1064
1065                        /* Notify waiting tasks that events are available */
1066                        if (waitqueue_active(&ep->wq))
1067                                wake_up_locked(&ep->wq);
1068                        if (waitqueue_active(&ep->poll_wait))
1069                                pwake++;
1070                }
1071                spin_unlock_irq(&ep->lock);
1072        }
1073
1074        /* We have to call this outside the lock */
1075        if (pwake)
1076                ep_poll_safewake(&ep->poll_wait);
1077
1078        return 0;
1079}
1080
1081static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1082                               void *priv)
1083{
1084        struct ep_send_events_data *esed = priv;
1085        int eventcnt;
1086        unsigned int revents;
1087        struct epitem *epi;
1088        struct epoll_event __user *uevent;
1089
1090        /*
1091         * We can loop without lock because we are passed a task private list.
1092         * Items cannot vanish during the loop because ep_scan_ready_list() is
1093         * holding "mtx" during this call.
1094         */
1095        for (eventcnt = 0, uevent = esed->events;
1096             !list_empty(head) && eventcnt < esed->maxevents;) {
1097                epi = list_first_entry(head, struct epitem, rdllink);
1098
1099                list_del_init(&epi->rdllink);
1100
1101                revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
1102                        epi->event.events;
1103
1104                /*
1105                 * If the event mask intersect the caller-requested one,
1106                 * deliver the event to userspace. Again, ep_scan_ready_list()
1107                 * is holding "mtx", so no operations coming from userspace
1108                 * can change the item.
1109                 */
1110                if (revents) {
1111                        if (__put_user(revents, &uevent->events) ||
1112                            __put_user(epi->event.data, &uevent->data)) {
1113                                list_add(&epi->rdllink, head);
1114                                return eventcnt ? eventcnt : -EFAULT;
1115                        }
1116                        eventcnt++;
1117                        uevent++;
1118                        if (epi->event.events & EPOLLONESHOT)
1119                                epi->event.events &= EP_PRIVATE_BITS;
1120                        else if (!(epi->event.events & EPOLLET)) {
1121                                /*
1122                                 * If this file has been added with Level
1123                                 * Trigger mode, we need to insert back inside
1124                                 * the ready list, so that the next call to
1125                                 * epoll_wait() will check again the events
1126                                 * availability. At this point, no one can insert
1127                                 * into ep->rdllist besides us. The epoll_ctl()
1128                                 * callers are locked out by
1129                                 * ep_scan_ready_list() holding "mtx" and the
1130                                 * poll callback will queue them in ep->ovflist.
1131                                 */
1132                                list_add_tail(&epi->rdllink, &ep->rdllist);
1133                        }
1134                }
1135        }
1136
1137        return eventcnt;
1138}
1139
1140static int ep_send_events(struct eventpoll *ep,
1141                          struct epoll_event __user *events, int maxevents)
1142{
1143        struct ep_send_events_data esed;
1144
1145        esed.maxevents = maxevents;
1146        esed.events = events;
1147
1148        return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1149}
1150
1151static inline struct timespec ep_set_mstimeout(long ms)
1152{
1153        struct timespec now, ts = {
1154                .tv_sec = ms / MSEC_PER_SEC,
1155                .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1156        };
1157
1158        ktime_get_ts(&now);
1159        return timespec_add_safe(now, ts);
1160}
1161
1162/**
1163 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1164 *           event buffer.
1165 *
1166 * @ep: Pointer to the eventpoll context.
1167 * @events: Pointer to the userspace buffer where the ready events should be
1168 *          stored.
1169 * @maxevents: Size (in terms of number of events) of the caller event buffer.
1170 * @timeout: Maximum timeout for the ready events fetch operation, in
1171 *           milliseconds. If the @timeout is zero, the function will not block,
1172 *           while if the @timeout is less than zero, the function will block
1173 *           until at least one event has been retrieved (or an error
1174 *           occurred).
1175 *
1176 * Returns: Returns the number of ready events which have been fetched, or an
1177 *          error code, in case of error.
1178 */
1179static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1180                   int maxevents, long timeout)
1181{
1182        int res = 0, eavail, timed_out = 0;
1183        unsigned long flags;
1184        long slack = 0;
1185        wait_queue_t wait;
1186        ktime_t expires, *to = NULL;
1187
1188        if (timeout > 0) {
1189                struct timespec end_time = ep_set_mstimeout(timeout);
1190
1191                slack = select_estimate_accuracy(&end_time);
1192                to = &expires;
1193                *to = timespec_to_ktime(end_time);
1194        } else if (timeout == 0) {
1195                /*
1196                 * Avoid the unnecessary trip to the wait queue loop, if the
1197                 * caller specified a non blocking operation.
1198                 */
1199                timed_out = 1;
1200                spin_lock_irqsave(&ep->lock, flags);
1201                goto check_events;
1202        }
1203
1204fetch_events:
1205        spin_lock_irqsave(&ep->lock, flags);
1206
1207        if (!ep_events_available(ep)) {
1208                /*
1209                 * We don't have any available event to return to the caller.
1210                 * We need to sleep here, and we will be wake up by
1211                 * ep_poll_callback() when events will become available.
1212                 */
1213                init_waitqueue_entry(&wait, current);
1214                __add_wait_queue_exclusive(&ep->wq, &wait);
1215
1216                for (;;) {
1217                        /*
1218                         * We don't want to sleep if the ep_poll_callback() sends us
1219                         * a wakeup in between. That's why we set the task state
1220                         * to TASK_INTERRUPTIBLE before doing the checks.
1221                         */
1222                        set_current_state(TASK_INTERRUPTIBLE);
1223                        if (ep_events_available(ep) || timed_out)
1224                                break;
1225                        if (signal_pending(current)) {
1226                                res = -EINTR;
1227                                break;
1228                        }
1229
1230                        spin_unlock_irqrestore(&ep->lock, flags);
1231                        if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1232                                timed_out = 1;
1233
1234                        spin_lock_irqsave(&ep->lock, flags);
1235                }
1236                __remove_wait_queue(&ep->wq, &wait);
1237
1238                set_current_state(TASK_RUNNING);
1239        }
1240check_events:
1241        /* Is it worth to try to dig for events ? */
1242        eavail = ep_events_available(ep);
1243
1244        spin_unlock_irqrestore(&ep->lock, flags);
1245
1246        /*
1247         * Try to transfer events to user space. In case we get 0 events and
1248         * there's still timeout left over, we go trying again in search of
1249         * more luck.
1250         */
1251        if (!res && eavail &&
1252            !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1253                goto fetch_events;
1254
1255        return res;
1256}
1257
1258/**
1259 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1260 *                      API, to verify that adding an epoll file inside another
1261 *                      epoll structure, does not violate the constraints, in
1262 *                      terms of closed loops, or too deep chains (which can
1263 *                      result in excessive stack usage).
1264 *
1265 * @priv: Pointer to the epoll file to be currently checked.
1266 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1267 *          data structure pointer.
1268 * @call_nests: Current dept of the @ep_call_nested() call stack.
1269 *
1270 * Returns: Returns zero if adding the epoll @file inside current epoll
1271 *          structure @ep does not violate the constraints, or -1 otherwise.
1272 */
1273static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1274{
1275        int error = 0;
1276        struct file *file = priv;
1277        struct eventpoll *ep = file->private_data;
1278        struct rb_node *rbp;
1279        struct epitem *epi;
1280
1281        mutex_lock_nested(&ep->mtx, call_nests + 1);
1282        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1283                epi = rb_entry(rbp, struct epitem, rbn);
1284                if (unlikely(is_file_epoll(epi->ffd.file))) {
1285                        error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1286                                               ep_loop_check_proc, epi->ffd.file,
1287                                               epi->ffd.file->private_data, current);
1288                        if (error != 0)
1289                                break;
1290                }
1291        }
1292        mutex_unlock(&ep->mtx);
1293
1294        return error;
1295}
1296
1297/**
1298 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1299 *                 another epoll file (represented by @ep) does not create
1300 *                 closed loops or too deep chains.
1301 *
1302 * @ep: Pointer to the epoll private data structure.
1303 * @file: Pointer to the epoll file to be checked.
1304 *
1305 * Returns: Returns zero if adding the epoll @file inside current epoll
1306 *          structure @ep does not violate the constraints, or -1 otherwise.
1307 */
1308static int ep_loop_check(struct eventpoll *ep, struct file *file)
1309{
1310        return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1311                              ep_loop_check_proc, file, ep, current);
1312}
1313
1314/*
1315 * Open an eventpoll file descriptor.
1316 */
1317SYSCALL_DEFINE1(epoll_create1, int, flags)
1318{
1319        int error;
1320        struct eventpoll *ep = NULL;
1321
1322        /* Check the EPOLL_* constant for consistency.  */
1323        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1324
1325        if (flags & ~EPOLL_CLOEXEC)
1326                return -EINVAL;
1327        /*
1328         * Create the internal data structure ("struct eventpoll").
1329         */
1330        error = ep_alloc(&ep);
1331        if (error < 0)
1332                return error;
1333        /*
1334         * Creates all the items needed to setup an eventpoll file. That is,
1335         * a file structure and a free file descriptor.
1336         */
1337        error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1338                                 O_RDWR | (flags & O_CLOEXEC));
1339        if (error < 0)
1340                ep_free(ep);
1341
1342        return error;
1343}
1344
1345SYSCALL_DEFINE1(epoll_create, int, size)
1346{
1347        if (size <= 0)
1348                return -EINVAL;
1349
1350        return sys_epoll_create1(0);
1351}
1352
1353/*
1354 * The following function implements the controller interface for
1355 * the eventpoll file that enables the insertion/removal/change of
1356 * file descriptors inside the interest set.
1357 */
1358SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1359                struct epoll_event __user *, event)
1360{
1361        int error;
1362        int did_lock_epmutex = 0;
1363        struct file *file, *tfile;
1364        struct eventpoll *ep;
1365        struct epitem *epi;
1366        struct epoll_event epds;
1367
1368        error = -EFAULT;
1369        if (ep_op_has_event(op) &&
1370            copy_from_user(&epds, event, sizeof(struct epoll_event)))
1371                goto error_return;
1372
1373        /* Get the "struct file *" for the eventpoll file */
1374        error = -EBADF;
1375        file = fget(epfd);
1376        if (!file)
1377                goto error_return;
1378
1379        /* Get the "struct file *" for the target file */
1380        tfile = fget(fd);
1381        if (!tfile)
1382                goto error_fput;
1383
1384        /* The target file descriptor must support poll */
1385        error = -EPERM;
1386        if (!tfile->f_op || !tfile->f_op->poll)
1387                goto error_tgt_fput;
1388
1389        /*
1390         * We have to check that the file structure underneath the file descriptor
1391         * the user passed to us _is_ an eventpoll file. And also we do not permit
1392         * adding an epoll file descriptor inside itself.
1393         */
1394        error = -EINVAL;
1395        if (file == tfile || !is_file_epoll(file))
1396                goto error_tgt_fput;
1397
1398        /*
1399         * At this point it is safe to assume that the "private_data" contains
1400         * our own data structure.
1401         */
1402        ep = file->private_data;
1403
1404        /*
1405         * When we insert an epoll file descriptor, inside another epoll file
1406         * descriptor, there is the change of creating closed loops, which are
1407         * better be handled here, than in more critical paths.
1408         *
1409         * We hold epmutex across the loop check and the insert in this case, in
1410         * order to prevent two separate inserts from racing and each doing the
1411         * insert "at the same time" such that ep_loop_check passes on both
1412         * before either one does the insert, thereby creating a cycle.
1413         */
1414        if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
1415                mutex_lock(&epmutex);
1416                did_lock_epmutex = 1;
1417                error = -ELOOP;
1418                if (ep_loop_check(ep, tfile) != 0)
1419                        goto error_tgt_fput;
1420        }
1421
1422
1423        mutex_lock_nested(&ep->mtx, 0);
1424
1425        /*
1426         * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
1427         * above, we can be sure to be able to use the item looked up by
1428         * ep_find() till we release the mutex.
1429         */
1430        epi = ep_find(ep, tfile, fd);
1431
1432        error = -EINVAL;
1433        switch (op) {
1434        case EPOLL_CTL_ADD:
1435                if (!epi) {
1436                        epds.events |= POLLERR | POLLHUP;
1437                        error = ep_insert(ep, &epds, tfile, fd);
1438                } else
1439                        error = -EEXIST;
1440                break;
1441        case EPOLL_CTL_DEL:
1442                if (epi)
1443                        error = ep_remove(ep, epi);
1444                else
1445                        error = -ENOENT;
1446                break;
1447        case EPOLL_CTL_MOD:
1448                if (epi) {
1449                        epds.events |= POLLERR | POLLHUP;
1450                        error = ep_modify(ep, epi, &epds);
1451                } else
1452                        error = -ENOENT;
1453                break;
1454        }
1455        mutex_unlock(&ep->mtx);
1456
1457error_tgt_fput:
1458        if (unlikely(did_lock_epmutex))
1459                mutex_unlock(&epmutex);
1460
1461        fput(tfile);
1462error_fput:
1463        fput(file);
1464error_return:
1465
1466        return error;
1467}
1468
1469/*
1470 * Implement the event wait interface for the eventpoll file. It is the kernel
1471 * part of the user space epoll_wait(2).
1472 */
1473SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1474                int, maxevents, int, timeout)
1475{
1476        int error;
1477        struct file *file;
1478        struct eventpoll *ep;
1479
1480        /* The maximum number of event must be greater than zero */
1481        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1482                return -EINVAL;
1483
1484        /* Verify that the area passed by the user is writeable */
1485        if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
1486                error = -EFAULT;
1487                goto error_return;
1488        }
1489
1490        /* Get the "struct file *" for the eventpoll file */
1491        error = -EBADF;
1492        file = fget(epfd);
1493        if (!file)
1494                goto error_return;
1495
1496        /*
1497         * We have to check that the file structure underneath the fd
1498         * the user passed to us _is_ an eventpoll file.
1499         */
1500        error = -EINVAL;
1501        if (!is_file_epoll(file))
1502                goto error_fput;
1503
1504        /*
1505         * At this point it is safe to assume that the "private_data" contains
1506         * our own data structure.
1507         */
1508        ep = file->private_data;
1509
1510        /* Time to fish for events ... */
1511        error = ep_poll(ep, events, maxevents, timeout);
1512
1513error_fput:
1514        fput(file);
1515error_return:
1516
1517        return error;
1518}
1519
#ifdef HAVE_SET_RESTORE_SIGMASK

/*
 * Event wait interface of the eventpoll file with a temporary signal mask.
 * This is the kernel part of the user space epoll_pwait(2).
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	int error;

	/*
	 * Install the signal mask the caller wants in place during the wait.
	 * SIGKILL and SIGSTOP are removed from it first.
	 */
	if (sigmask) {
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;
		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	error = sys_epoll_wait(epfd, events, maxevents, timeout);

	/* The signal mask was never touched: nothing to undo */
	if (!sigmask)
		return error;

	if (error != -EINTR) {
		/* No signal interrupted the wait: put the old mask back now */
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
		return error;
	}

	/*
	 * A signal arrived while waiting. Do not restore the mask yet: stash
	 * it, so that do_signal() can deliver the signal on the way back to
	 * userspace before the original mask gets restored.
	 */
	memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved));
	set_restore_sigmask();

	return error;
}

#endif /* HAVE_SET_RESTORE_SIGMASK */
1567
1568static int __init eventpoll_init(void)
1569{
1570        struct sysinfo si;
1571
1572        si_meminfo(&si);
1573        /*
1574         * Allows top 4% of lomem to be allocated for epoll watches (per user).
1575         */
1576        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1577                EP_ITEM_COST;
1578        BUG_ON(max_user_watches < 0);
1579
1580        /*
1581         * Initialize the structure used to perform epoll file descriptor
1582         * inclusion loops checks.
1583         */
1584        ep_nested_calls_init(&poll_loop_ncalls);
1585
1586        /* Initialize the structure used to perform safe poll wait head wake ups */
1587        ep_nested_calls_init(&poll_safewake_ncalls);
1588
1589        /* Initialize the structure used to perform file's f_op->poll() calls */
1590        ep_nested_calls_init(&poll_readywalk_ncalls);
1591
1592        /* Allocates slab cache used to allocate "struct epitem" items */
1593        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1594                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1595
1596        /* Allocates slab cache used to allocate "struct eppoll_entry" */
1597        pwq_cache = kmem_cache_create("eventpoll_pwq",
1598                        sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1599
1600        return 0;
1601}
1602fs_initcall(eventpoll_init);
Note: See TracBrowser for help on using the repository browser.