diff -Nru linux-2.4.18.vanilla/Documentation/Configure.help linux-2.4.18.epoll/Documentation/Configure.help --- linux-2.4.18.vanilla/Documentation/Configure.help Mon Feb 25 11:37:51 2002 +++ linux-2.4.18.epoll/Documentation/Configure.help Wed Mar 27 19:27:42 2002 @@ -17760,6 +17760,26 @@ contains more information and the location of the joystick package that you'll need. +/dev/epoll support +CONFIG_EVENTPOLL + This option will allow for the creation of a '/dev/epoll' character + device, with major number 10 (MISC_MAJOR) and minor number 124 + (EVENTPOLL_MINOR). + + This device can be used to very efficiently handle incoming events on a + socket, much more so than select() or poll(). There is a paper that + describes this device and how to program for it (as well as including + some very impressive benchmarks) at the following URL: + http://www.xmailserver.org/linux-patches/nio-improve.html + + If you are writing very scalable servers and wish to code against + /dev/epoll for enhanced speed, say 'Y' or 'M' here. If you have + software in hand that requires (or can make use of) /dev/epoll, + also say 'Y' or 'M' here. + + The vast majority of the planet can very safely say 'N' here + and breathe easily. + Game port support CONFIG_INPUT_GAMEPORT Gameport support is for the standard 15-pin PC gameport. If you diff -Nru linux-2.4.18.vanilla/Makefile linux-2.4.18.epoll/Makefile --- linux-2.4.18.vanilla/Makefile Mon Feb 25 11:37:52 2002 +++ linux-2.4.18.epoll/Makefile Wed Mar 27 19:27:42 2002 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 18 -EXTRAVERSION = +EXTRAVERSION = epoll KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nru linux-2.4.18.vanilla/drivers/char/Config.in linux-2.4.18.epoll/drivers/char/Config.in --- linux-2.4.18.vanilla/drivers/char/Config.in Mon Feb 25 11:37:57 2002 +++ linux-2.4.18.epoll/drivers/char/Config.in Wed Mar 27 19:27:42 2002 @@ -185,6 +185,7 @@ dep_tristate 'Intel i8x0 Random Number Generator support' CONFIG_INTEL_RNG $CONFIG_PCI tristate '/dev/nvram support' CONFIG_NVRAM +tristate '/dev/epoll - Efficent file event polling method' CONFIG_EVENTPOLL tristate 'Enhanced Real Time Clock Support' CONFIG_RTC if [ "$CONFIG_IA64" = "y" ]; then bool 'EFI Real Time Clock Services' CONFIG_EFI_RTC diff -Nru linux-2.4.18.vanilla/drivers/char/Makefile linux-2.4.18.epoll/drivers/char/Makefile --- linux-2.4.18.vanilla/drivers/char/Makefile Mon Feb 25 11:37:57 2002 +++ linux-2.4.18.epoll/drivers/char/Makefile Wed Mar 27 19:27:42 2002 @@ -192,6 +192,7 @@ ifeq ($(CONFIG_PPC),) obj-$(CONFIG_NVRAM) += nvram.o endif +obj-$(CONFIG_EVENTPOLL) += eventpoll.o obj-$(CONFIG_TOSHIBA) += toshiba.o obj-$(CONFIG_I8K) += i8k.o obj-$(CONFIG_DS1620) += ds1620.o diff -Nru linux-2.4.18.vanilla/drivers/char/eventpoll.c linux-2.4.18.epoll/drivers/char/eventpoll.c --- linux-2.4.18.vanilla/drivers/char/eventpoll.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.18.epoll/drivers/char/eventpoll.c Wed Mar 27 19:37:19 2002 @@ -0,0 +1,800 @@ +/* + * drivers/char/eventpoll.c + * + * Copyright (C) 2001, Davide Libenzi + * + * Efficent event polling implementation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + + + + +#define DEBUG 0 +#ifdef DEBUG +#define DPRINTK(x) printk x +#define DNPRINTK(n,x) if (n <= DEBUG) printk x +#else +#define DPRINTK(x) +#define DNPRINTK(n,x) +#endif + +#define DEBUG_DPI 0 + +#if DEBUG_DPI +#define DPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) +#else +#define DPI_SLAB_DEBUG 0 +#endif + +#define INITIAL_HASH_BITS 7 +#define MAX_HASH_BITS 18 +#define RESIZE_LENGTH 2 + +#define dpi_mem_alloc() (struct epitem *) kmem_cache_alloc(dpi_cache, SLAB_KERNEL) +#define dpi_mem_free(p) kmem_cache_free(dpi_cache, p) + + + + + +typedef unsigned long long event_version_t; + +struct eventpoll { + rwlock_t lock; + wait_queue_head_t wq; + wait_queue_head_t poll_wait; + struct list_head *hash; + unsigned int hbits; + unsigned int hmask; + atomic_t hents; + atomic_t resize; + int numpages; + char **pages; + char *pages0[MAX_EVENTPOLL_PAGES]; + char *pages1[MAX_EVENTPOLL_PAGES]; + atomic_t mmapped; + int eventcnt; + event_version_t ver; +}; + +struct epitem { + struct list_head llink; + struct eventpoll *ep; + struct file *file; + struct pollfd pfd; + int index; + event_version_t ver; +}; + + + + + + +static int ep_alloc_pages(char **pages, int numpages); +static int ep_free_pages(char **pages, int numpages); +static int ep_init(struct eventpoll *ep); +static void ep_free(struct eventpoll *ep); +static inline struct epitem *ep_find_nl(struct eventpoll *ep, int fd); +static struct epitem *ep_find(struct eventpoll *ep, int fd); +static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags); +static int ep_insert(struct eventpoll *ep, struct pollfd *pfd); +static int ep_remove(struct eventpoll *ep, struct epitem *dpi); +static void notify_proc(struct file *file, void *data, unsigned long *local, long *event); +static int open_eventpoll(struct inode *inode, struct file *file); +static int close_eventpoll(struct inode *inode, struct file *file); +static unsigned int poll_eventpoll(struct file *file, poll_table *wait); +static int write_eventpoll(struct file *file, const char *buffer, size_t count, + loff_t *ppos); +static int ep_poll(struct eventpoll *ep, void *arg); +static int ioctl_eventpoll(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +static void eventpoll_mm_open(struct vm_area_struct * vma); +static void eventpoll_mm_close(struct vm_area_struct * vma); +static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma); + + + + +static kmem_cache_t *dpi_cache; + +static struct file_operations eventpoll_fops = { + write: write_eventpoll, + ioctl: ioctl_eventpoll, + mmap: mmap_eventpoll, + open: open_eventpoll, + release: close_eventpoll, + poll: poll_eventpoll +}; + +static struct vm_operations_struct eventpoll_mmap_ops = { + open: eventpoll_mm_open, + close: eventpoll_mm_close, +}; + +static struct miscdevice eventpoll = { + EVENTPOLL_MINOR, "eventpoll", &eventpoll_fops +}; + + + + +static int ep_alloc_pages(char **pages, int numpages) +{ + int ii; + + for (ii = 0; ii < numpages; ii++) { + pages[ii] = (char *) __get_free_pages(GFP_KERNEL, 0); + if (!pages[ii]) { + for (--ii; ii >= 0; ii--) { + clear_bit(PG_reserved, &virt_to_page(pages[ii])->flags); + free_pages((unsigned long) pages[ii], 0); + } + return -ENOMEM; + } + set_bit(PG_reserved, &virt_to_page(pages[ii])->flags); + } + return 0; +} + + +static int ep_free_pages(char **pages, int numpages) +{ + int ii; + + for (ii = 0; ii < numpages; ii++) { + clear_bit(PG_reserved, &virt_to_page(pages[ii])->flags); + free_pages((unsigned long) pages[ii], 0); + } + return 0; +} + + +static int ep_init(struct eventpoll *ep) +{ + int ii, hentries; + + rwlock_init(&ep->lock); + init_waitqueue_head(&ep->wq); + init_waitqueue_head(&ep->poll_wait); + ep->hbits = INITIAL_HASH_BITS; + ep->hmask = (1 << ep->hbits) - 1; + atomic_set(&ep->hents, 0); + atomic_set(&ep->resize, 0); + atomic_set(&ep->mmapped, 0); + ep->numpages = 0; + ep->pages = ep->pages0; + ep->eventcnt = 0; + ep->ver = 1; + + hentries = ep->hmask + 1; + if (!(ep->hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) + return -ENOMEM; + + for (ii = 0; ii < hentries; ii++) + INIT_LIST_HEAD(&ep->hash[ii]); + + return 0; +} + + +static void ep_free(struct eventpoll *ep) +{ + int ii; + struct list_head *lnk; + + lock_kernel(); + for (ii = 0; ii <= ep->hmask; ii++) { + while ((lnk = list_first(&ep->hash[ii]))) { + struct epitem *dpi = list_entry(lnk, struct epitem, llink); + + file_notify_delcb(dpi->file, notify_proc); + list_del(lnk); + dpi_mem_free(dpi); + } + } + vfree(ep->hash); + if (ep->numpages > 0) { + ep_free_pages(ep->pages0, ep->numpages); + ep_free_pages(ep->pages1, ep->numpages); + } + unlock_kernel(); +} + + +static inline struct epitem *ep_find_nl(struct eventpoll *ep, int fd) +{ + struct epitem *dpi = NULL; + struct list_head *lsthead, *lnk; + + lsthead = &ep->hash[fd & ep->hmask]; + list_for_each(lnk, lsthead) { + dpi = list_entry(lnk, struct epitem, llink); + + if (dpi->pfd.fd == fd) break; + dpi = NULL; + } + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_find(%d) -> %p\n", current, fd, dpi)); + + return dpi; +} + + +static struct epitem *ep_find(struct eventpoll *ep, int fd) +{ + struct epitem *dpi; + unsigned long flags; + + read_lock_irqsave(&ep->lock, flags); + + dpi = ep_find_nl(ep, fd); + + read_unlock_irqrestore(&ep->lock, flags); + + return dpi; +} + + +static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags) +{ + struct list_head *hash, *oldhash; + unsigned int hbits = ep->hbits + 1; + unsigned int hmask = (1 << hbits) - 1; + int ii, res, hentries = hmask + 1; + unsigned long flags = *kflags; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_hashresize(%p) bits=%u\n", current, ep, hbits)); + + write_unlock_irqrestore(&ep->lock, flags); + + res = -ENOMEM; + if (!(hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) { + write_lock_irqsave(&ep->lock, flags); + goto out; + } + + for (ii = 0; ii < hentries; ii++) + INIT_LIST_HEAD(&hash[ii]); + + write_lock_irqsave(&ep->lock, flags); + + oldhash = ep->hash; + for (ii = 0; ii <= ep->hmask; ii++) { + struct list_head *oldhead = &oldhash[ii], *lnk; + + while ((lnk = list_first(oldhead))) { + struct epitem *dpi = list_entry(lnk, struct epitem, llink); + + list_del(lnk); + list_add(lnk, &hash[dpi->pfd.fd & hmask]); + } + } + + ep->hash = hash; + ep->hbits = hbits; + ep->hmask = hmask; + + write_unlock_irqrestore(&ep->lock, flags); + vfree(oldhash); + write_lock_irqsave(&ep->lock, flags); + + res = 0; +out: + *kflags = flags; + atomic_dec(&ep->resize); + return res; +} + + +static int ep_insert(struct eventpoll *ep, struct pollfd *pfd) +{ + struct epitem *dpi; + struct file *file; + unsigned long flags; + + if (atomic_read(&ep->hents) >= (ep->numpages * POLLFD_X_PAGE)) + return -E2BIG; + + if (!(file = fcheck(pfd->fd))) + return -EINVAL; + + if (!(dpi = dpi_mem_alloc())) + return -ENOMEM; + + INIT_LIST_HEAD(&dpi->llink); + dpi->ep = ep; + dpi->file = file; + dpi->pfd = *pfd; + dpi->index = -1; + dpi->ver = ep->ver - 1; + + write_lock_irqsave(&ep->lock, flags); + + list_add(&dpi->llink, &ep->hash[pfd->fd & ep->hmask]); + atomic_inc(&ep->hents); + + if (!atomic_read(&ep->resize) && + (atomic_read(&ep->hents) >> ep->hbits) > RESIZE_LENGTH && + ep->hbits < MAX_HASH_BITS) { + atomic_inc(&ep->resize); + ep_hashresize(ep, &flags); + } + + write_unlock_irqrestore(&ep->lock, flags); + + file_notify_addcb(file, notify_proc, dpi); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_insert(%p, %d)\n", current, ep, pfd->fd)); + + return 0; +} + + +static int ep_remove(struct eventpoll *ep, struct epitem *dpi) +{ + int fd = dpi->pfd.fd; + unsigned long flags; + struct pollfd *pfd, *lpfd; + struct epitem *ldpi; + + file_notify_delcb(dpi->file, notify_proc); + + write_lock_irqsave(&ep->lock, flags); + + list_del(&dpi->llink); + atomic_dec(&ep->hents); + + if (dpi->index >= 0 && dpi->ver == ep->ver && dpi->index < ep->eventcnt) { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd == dpi->pfd.fd && dpi->index < --ep->eventcnt) { + lpfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(ep->eventcnt)] + + EVENT_PAGE_OFFSET(ep->eventcnt)); + *pfd = *lpfd; + + if ((ldpi = ep_find_nl(ep, pfd->fd))) ldpi->index = dpi->index; + } + } + + write_unlock_irqrestore(&ep->lock, flags); + + dpi_mem_free(dpi); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ep_remove(%p, %d)\n", current, ep, fd)); + + return 0; +} + + +static void notify_proc(struct file *file, void *data, unsigned long *local, long *event) +{ + struct epitem *dpi = (struct epitem *) data; + struct eventpoll *ep = dpi->ep; + struct pollfd *pfd; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: notify(%p, %p, %ld, %ld) ep=%p\n", + current, file, data, event[0], event[1], ep)); + + write_lock(&ep->lock); + if (!(dpi->pfd.events & event[1])) + goto out; + + if (dpi->index < 0 || dpi->ver != ep->ver) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + dpi->ver = ep->ver; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } else { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd != dpi->pfd.fd) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } + } + + pfd->revents |= (pfd->events & event[1]); + + if (waitqueue_active(&ep->wq)) + wake_up(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + wake_up(&ep->poll_wait); +out: + write_unlock(&ep->lock); +} + + +static int open_eventpoll(struct inode *inode, struct file *file) +{ + int res; + struct eventpoll *ep; + + if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL))) + return -ENOMEM; + + memset(ep, 0, sizeof(*ep)); + if ((res = ep_init(ep))) { + kfree(ep); + return res; + } + + file->private_data = ep; + + MOD_INC_USE_COUNT; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: open() ep=%p\n", current, ep)); + return 0; +} + + +static int close_eventpoll(struct inode *inode, struct file *file) +{ + struct eventpoll *ep = file->private_data; + + ep_free(ep); + + kfree(ep); + + MOD_DEC_USE_COUNT; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: close() ep=%p\n", current, ep)); + return 0; +} + + +static unsigned int poll_eventpoll(struct file *file, poll_table *wait) +{ + struct eventpoll *ep = file->private_data; + + poll_wait(file, &ep->poll_wait, wait); + if (ep->eventcnt) + return POLLIN | POLLRDNORM; + + return 0; +} + + +static int write_eventpoll(struct file *file, const char *buffer, size_t count, + loff_t *ppos) +{ + int res, rcount; + struct eventpoll *ep = file->private_data; + struct epitem *dpi; + struct pollfd pfd; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: write(%p, %d)\n", current, ep, count)); + + if (count % sizeof(struct pollfd)) + return -EINVAL; + + if ((res = verify_area(VERIFY_READ, buffer, count))) + return res; + + rcount = 0; + + lock_kernel(); + + while (count > 0) { + __copy_from_user(&pfd, buffer, sizeof(pfd)); + + dpi = ep_find(ep, pfd.fd); + + if (pfd.fd >= current->files->max_fds || !current->files->fd[pfd.fd]) + pfd.events = POLLREMOVE; + if (pfd.events & POLLREMOVE) { + if (dpi) { + ep_remove(ep, dpi); + rcount += sizeof(pfd); + } + } + else if (dpi) { + dpi->pfd.events = pfd.events; + rcount += sizeof(pfd); + } else { + pfd.revents = 0; + if (!ep_insert(ep, &pfd)) + rcount += sizeof(pfd); + } + + buffer += sizeof(pfd); + count -= sizeof(pfd); + } + + unlock_kernel(); + + return rcount; +} + + +static int ep_poll(struct eventpoll *ep, void *arg) +{ + int res = 0; + long timeout; + unsigned long flags; + struct evpoll dvp; + wait_queue_t wait; + + if (copy_from_user(&dvp, arg, sizeof(struct evpoll))) + return -EFAULT; + + if (!atomic_read(&ep->mmapped)) + return -EINVAL; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_POLL, %d)\n", current, ep, dvp.ep_timeout)); + + write_lock_irqsave(&ep->lock, flags); + + res = 0; + if (!ep->eventcnt) { + init_waitqueue_entry(&wait, current); + add_wait_queue(&ep->wq, &wait); + timeout = dvp.ep_timeout == -1 || dvp.ep_timeout > MAX_SCHEDULE_TIMEOUT/HZ ? + MAX_SCHEDULE_TIMEOUT: (dvp.ep_timeout * HZ) / 1000; + for (;;) { + if (ep->eventcnt || !timeout) + break; + if (signal_pending(current)) { + res = -EINTR; + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + + write_unlock_irqrestore(&ep->lock, flags); + timeout = schedule_timeout(timeout); + write_lock_irqsave(&ep->lock, flags); + } + remove_wait_queue(&ep->wq, &wait); + + set_current_state(TASK_RUNNING); + } + + if (!res && ep->eventcnt) { + res = ep->eventcnt; + ep->eventcnt = 0; + ++ep->ver; + if (ep->pages == ep->pages0) { + ep->pages = ep->pages1; + dvp.ep_resoff = 0; + } else { + ep->pages = ep->pages0; + dvp.ep_resoff = ep->numpages * PAGE_SIZE; + } + } + + write_unlock_irqrestore(&ep->lock, flags); + + if (res > 0) + copy_to_user(arg, &dvp, sizeof(struct evpoll)); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_POLL, %d) == %d\n", current, ep, dvp.ep_timeout, res)); + return res; +} + + +static int ioctl_eventpoll(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int res, numpages; + struct eventpoll *ep = file->private_data; + struct epitem *dpi; + unsigned long flags; + struct pollfd pfd; + + switch (cmd) { + case EP_ALLOC: + if (atomic_read(&ep->mmapped)) + return -EBUSY; + + numpages = EP_FDS_PAGES(arg); + if (numpages > MAX_EVENTPOLL_PAGES) + return -EINVAL; + + res = 0; + write_lock_irqsave(&ep->lock, flags); + if (numpages > ep->numpages) { + if (!(res = ep_alloc_pages(&ep->pages0[ep->numpages], numpages - ep->numpages))) { + if (!(res = ep_alloc_pages(&ep->pages1[ep->numpages], numpages - ep->numpages))) { + ep->numpages = numpages; + } else { + ep_free_pages(&ep->pages0[ep->numpages], numpages - ep->numpages); + } + } + } + write_unlock_irqrestore(&ep->lock, flags); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_ALLOC, %lu) == %d\n", + current, ep, arg, res)); + return res; + + case EP_FREE: + if (atomic_read(&ep->mmapped)) + return -EBUSY; + + res = -EINVAL; + write_lock_irqsave(&ep->lock, flags); + if (ep->numpages > 0) { + ep_free_pages(ep->pages0, ep->numpages); + ep_free_pages(ep->pages1, ep->numpages); + ep->numpages = 0; + ep->pages = ep->pages0; + res = 0; + } + write_unlock_irqrestore(&ep->lock, flags); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_FREE) == %d\n", + current, ep, res)); + return res; + + case EP_POLL: + return ep_poll(ep, (void *) arg); + + case EP_ISPOLLED: + if (copy_from_user(&pfd, (void *) arg, sizeof(struct pollfd))) + return 0; + + read_lock_irqsave(&ep->lock, flags); + + res = 0; + if (!(dpi = ep_find_nl(ep, pfd.fd))) + goto out_ispolled; + + pfd = dpi->pfd; + res = 1; + + out_ispolled: + read_unlock_irqrestore(&ep->lock, flags); + + if (res) + copy_to_user((void *) arg, &pfd, sizeof(struct pollfd)); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: ioctl(%p, EP_ISPOLLED, %d) == %d\n", + current, ep, pfd.fd, res)); + return res; + } + + return -EINVAL; +} + + +static void eventpoll_mm_open(struct vm_area_struct * vma) +{ + struct file *file = vma->vm_file; + struct eventpoll *ep = file->private_data; + + if (ep) atomic_inc(&ep->mmapped); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mm_open(%p)\n", current, ep)); +} + + +static void eventpoll_mm_close(struct vm_area_struct * vma) +{ + struct file *file = vma->vm_file; + struct eventpoll *ep = file->private_data; + + if (ep) atomic_dec(&ep->mmapped); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mm_close(%p)\n", current, ep)); +} + + +static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma) +{ + struct eventpoll *ep = file->private_data; + unsigned long start, flags; + int ii, res; + int numpages; + size_t mapsize; + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mmap(%p, %lx, %lx)\n", + current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT)); + + if ((vma->vm_pgoff << PAGE_SHIFT) != 0) + return -EINVAL; + + mapsize = PAGE_ALIGN(vma->vm_end - vma->vm_start); + numpages = mapsize >> PAGE_SHIFT; + + write_lock_irqsave(&ep->lock, flags); + + res = -EINVAL; + if (numpages != (2 * ep->numpages)) + goto out; + + start = vma->vm_start; + for (ii = 0; ii < ep->numpages; ii++) { + if (remap_page_range(start, __pa(ep->pages0[ii]), + PAGE_SIZE, vma->vm_page_prot)) + goto out; + start += PAGE_SIZE; + } + for (ii = 0; ii < ep->numpages; ii++) { + if (remap_page_range(start, __pa(ep->pages1[ii]), + PAGE_SIZE, vma->vm_page_prot)) + goto out; + start += PAGE_SIZE; + } + vma->vm_ops = &eventpoll_mmap_ops; + atomic_set(&ep->mmapped, 1); + res = 0; +out: + write_unlock_irqrestore(&ep->lock, flags); + + DNPRINTK(3, (KERN_INFO "[%p] /dev/epoll: mmap(%p, %lx, %lx) == %d\n", + current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, res)); + return res; +} + + +int __init eventpoll_init(void) +{ + dpi_cache = kmem_cache_create("eventpoll", + sizeof(struct epitem), + __alignof__(struct epitem), + DPI_SLAB_DEBUG, NULL, NULL); + if (!dpi_cache) { + printk(KERN_INFO "[%p] /dev/epoll: driver install failed.\n", current); + return -ENOMEM; + } + + printk(KERN_INFO "[%p] /dev/epoll: driver installed.\n", current); + + misc_register(&eventpoll); + + return 0; +} + + +module_init(eventpoll_init); + +#ifdef MODULE + +void cleanup_module(void) +{ + misc_deregister(&eventpoll); + kmem_cache_destroy(dpi_cache); +} + +#endif + +MODULE_LICENSE("GPL"); + diff -Nru linux-2.4.18.vanilla/fs/Makefile linux-2.4.18.epoll/fs/Makefile --- linux-2.4.18.vanilla/fs/Makefile Mon Feb 25 11:38:07 2002 +++ linux-2.4.18.epoll/fs/Makefile Wed Mar 27 20:00:03 2002 @@ -7,12 +7,12 @@ O_TARGET := fs.o -export-objs := filesystems.o open.o dcache.o buffer.o +export-objs := filesystems.o open.o dcache.o buffer.o fcblist.o mod-subdirs := nls obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ - fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ + fcntl.o ioctl.o readdir.o select.o fifo.o locks.o fcblist.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o namespace.o seq_file.o diff -Nru linux-2.4.18.vanilla/fs/fcblist.c linux-2.4.18.epoll/fs/fcblist.c --- linux-2.4.18.vanilla/fs/fcblist.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.18.epoll/fs/fcblist.c Wed Mar 27 19:43:54 2002 @@ -0,0 +1,130 @@ +/* + * linux/fs/fcblist.c + * + * Copyright (C) 2001, Davide Libenzi + * + * Handle file callbacks + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +long ion_band_table[NSIGPOLL] = { + ION_IN, /* POLL_IN */ + ION_OUT, /* POLL_OUT */ + ION_IN, /* POLL_MSG */ + ION_ERR, /* POLL_ERR */ + 0, /* POLL_PRI */ + ION_HUP /* POLL_HUP */ +}; +EXPORT_SYMBOL(ion_band_table); + +long poll_band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; +EXPORT_SYMBOL(poll_band_table); + + +void file_notify_event(struct file *filep, long *event) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + fcblist_read_lock(filep, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + fcbp->cbproc(filep, fcbp->data, fcbp->local, event); + } + + fcblist_read_unlock(filep, flags); +} +EXPORT_SYMBOL(file_notify_event); + + +int file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data) +{ + unsigned long flags; + struct fcb_struct *fcbp; + + if (!(fcbp = (struct fcb_struct *) kmalloc(sizeof(struct fcb_struct), GFP_KERNEL))) + return -ENOMEM; + + memset(fcbp, 0, sizeof(struct fcb_struct)); + fcbp->cbproc = cbproc; + fcbp->data = data; + + fcblist_write_lock(filep, flags); + list_add_tail(&fcbp->llink, &filep->f_cblist); + fcblist_write_unlock(filep, flags); + + return 0; +} +EXPORT_SYMBOL(file_notify_addcb); + + +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + fcblist_write_lock(filep, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + if (fcbp->cbproc == cbproc) { + list_del(lnk); + fcblist_write_unlock(filep, flags); + kfree(fcbp); + return 0; + } + } + + fcblist_write_unlock(filep, flags); + + return -ENOENT; +} +EXPORT_SYMBOL(file_notify_delcb); + + +void file_notify_cleanup(struct file *filep) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + fcblist_write_lock(filep, flags); + + lsthead = &filep->f_cblist; + while ((lnk = list_first(lsthead))) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + list_del(lnk); + fcblist_write_unlock(filep, flags); + kfree(fcbp); + fcblist_write_lock(filep, flags); + } + + fcblist_write_unlock(filep, flags); +} +EXPORT_SYMBOL(file_notify_cleanup); + diff -Nru linux-2.4.18.vanilla/fs/file_table.c linux-2.4.18.epoll/fs/file_table.c --- linux-2.4.18.vanilla/fs/file_table.c Mon Sep 17 13:16:30 2001 +++ linux-2.4.18.epoll/fs/file_table.c Wed Mar 27 19:27:42 2002 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ f->f_uid = current->fsuid; f->f_gid = current->fsgid; list_add(&f->f_list, &anon_list); + file_notify_init(f); file_list_unlock(); return f; } @@ -91,6 +93,7 @@ filp->f_uid = current->fsuid; filp->f_gid = current->fsgid; filp->f_op = dentry->d_inode->i_fop; + file_notify_init(filp); if (filp->f_op->open) return filp->f_op->open(dentry->d_inode, filp); else @@ -104,6 +107,7 @@ struct inode * inode = dentry->d_inode; if (atomic_dec_and_test(&file->f_count)) { + file_notify_cleanup(file); locks_remove_flock(file); if (file->f_iobuf) diff -Nru linux-2.4.18.vanilla/fs/pipe.c linux-2.4.18.epoll/fs/pipe.c --- linux-2.4.18.vanilla/fs/pipe.c Fri Sep 28 18:03:48 2001 +++ linux-2.4.18.epoll/fs/pipe.c Wed Mar 27 19:27:42 2002 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; + int pfull; ssize_t size, read, ret; /* Seeks are not allowed on pipes. */ @@ -72,6 +74,7 @@ PIPE_WAITING_READERS(*inode)++; pipe_wait(inode); PIPE_WAITING_READERS(*inode)--; + pfull = PIPE_FULL(*inode); ret = -ERESTARTSYS; if (signal_pending(current)) goto out; @@ -82,6 +85,8 @@ goto out; } } + else + pfull = PIPE_FULL(*inode); /* Read what data is available. */ ret = -EFAULT; @@ -104,6 +109,9 @@ count -= chars; buf += chars; } + /* Send notification message */ + if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode)) + file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND); /* Cache behaviour optimization */ if (!PIPE_LEN(*inode)) @@ -136,6 +144,7 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; + int pempty; ssize_t free, written, ret; /* Seeks are not allowed on pipes. */ @@ -180,6 +189,7 @@ } /* Copy into available space. */ + pempty = PIPE_EMPTY(*inode); ret = -EFAULT; while (count > 0) { int space; @@ -208,6 +218,9 @@ break; do { + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); /* * Synchronous wake-up: it knows that this process * is going to give up this CPU, so it doesnt have @@ -217,6 +230,7 @@ PIPE_WAITING_WRITERS(*inode)++; pipe_wait(inode); PIPE_WAITING_WRITERS(*inode)--; + pempty = PIPE_EMPTY(*inode); if (signal_pending(current)) goto out; if (!PIPE_READERS(*inode)) @@ -225,6 +239,9 @@ ret = -EFAULT; } + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); /* Signal readers asynchronously that there is more data. */ wake_up_interruptible(PIPE_WAIT(*inode)); @@ -303,9 +320,22 @@ static int pipe_release(struct inode *inode, int decr, int decw) { + struct file *rdfile, *wrfile; down(PIPE_SEM(*inode)); PIPE_READERS(*inode) -= decr; PIPE_WRITERS(*inode) -= decw; + rdfile = PIPE_READFILE(*inode); + wrfile = PIPE_WRITEFILE(*inode); + if (decr && !PIPE_READERS(*inode)) { + PIPE_READFILE(*inode) = NULL; + if (wrfile) + file_send_notify(wrfile, ION_HUP, POLLHUP); + } + if (decw && !PIPE_WRITERS(*inode)) { + PIPE_WRITEFILE(*inode) = NULL; + if (rdfile) + file_send_notify(rdfile, ION_HUP, POLLHUP); + } if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { struct pipe_inode_info *info = inode->i_pipe; inode->i_pipe = NULL; @@ -458,6 +488,7 @@ PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0; PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; + PIPE_READFILE(*inode) = PIPE_WRITEFILE(*inode) = NULL; return inode; fail_page: @@ -564,6 +595,9 @@ f2->f_op = &write_pipe_fops; f2->f_mode = 2; f2->f_version = 0; + + PIPE_READFILE(*inode) = f1; + PIPE_WRITEFILE(*inode) = f2; fd_install(i, f1); fd_install(j, f2); diff -Nru linux-2.4.18.vanilla/include/asm-i386/poll.h linux-2.4.18.epoll/include/asm-i386/poll.h --- linux-2.4.18.vanilla/include/asm-i386/poll.h Thu Jan 23 11:01:28 1997 +++ linux-2.4.18.epoll/include/asm-i386/poll.h Wed Mar 27 19:27:42 2002 @@ -15,6 +15,7 @@ #define POLLWRNORM 0x0100 #define POLLWRBAND 0x0200 #define POLLMSG 0x0400 +#define POLLREMOVE 0x1000 struct pollfd { int fd; diff -Nru linux-2.4.18.vanilla/include/linux/eventpoll.h linux-2.4.18.epoll/include/linux/eventpoll.h --- linux-2.4.18.vanilla/include/linux/eventpoll.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.18.epoll/include/linux/eventpoll.h Wed Mar 27 19:27:42 2002 @@ -0,0 +1,43 @@ +/* + * include/linux/eventpoll.h + * + * Copyright (C) 2001, Davide Libenzi + * + * Efficent event polling implementation + */ + + +#ifndef _LINUX_EVENTPOLL_H +#define _LINUX_EVENTPOLL_H + + + + +#define EVENTPOLL_MINOR 124 +#define POLLFD_X_PAGE (PAGE_SIZE / sizeof(struct pollfd)) +#define MAX_FDS_IN_EVENTPOLL (1024 * 128) +#define MAX_EVENTPOLL_PAGES (MAX_FDS_IN_EVENTPOLL / POLLFD_X_PAGE) +#define EVENT_PAGE_INDEX(n) ((n) / POLLFD_X_PAGE) +#define EVENT_PAGE_REM(n) ((n) % POLLFD_X_PAGE) +#define EVENT_PAGE_OFFSET(n) (((n) % POLLFD_X_PAGE) * sizeof(struct pollfd)) +#define EP_FDS_PAGES(n) (((n) + POLLFD_X_PAGE - 1) / POLLFD_X_PAGE) +#define EP_MAP_SIZE(n) (EP_FDS_PAGES(n) * PAGE_SIZE * 2) + + + + + +struct evpoll { + int ep_timeout; + unsigned long ep_resoff; +}; + +#define EP_ALLOC _IOR('P', 1, int) +#define EP_POLL _IOWR('P', 2, struct evpoll) +#define EP_FREE _IO('P', 3) +#define EP_ISPOLLED _IOWR('P', 4, struct pollfd) + + + +#endif + diff -Nru linux-2.4.18.vanilla/include/linux/fcblist.h linux-2.4.18.epoll/include/linux/fcblist.h --- linux-2.4.18.vanilla/include/linux/fcblist.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.18.epoll/include/linux/fcblist.h Wed Mar 27 19:43:03 2002 @@ -0,0 +1,67 @@ +/* + * include/linux/fcblist.h + * + * Copyright (C) 2001, Davide Libenzi + * + * Handle file callbacks + */ + +#ifndef __LINUX_FCBLIST_H +#define __LINUX_FCBLIST_H + +#include +#include +#include +#include + + + +/* file callback notification events */ +#define ION_IN 1 +#define ION_OUT 2 +#define ION_HUP 3 +#define ION_ERR 4 + +#define FCB_LOCAL_SIZE 4 + +#define fcblist_read_lock(fp, fl) read_lock_irqsave(&(fp)->f_cblock, fl) +#define fcblist_read_unlock(fp, fl) read_unlock_irqrestore(&(fp)->f_cblock, fl) +#define fcblist_write_lock(fp, fl) write_lock_irqsave(&(fp)->f_cblock, fl) +#define fcblist_write_unlock(fp, fl) write_unlock_irqrestore(&(fp)->f_cblock, fl) + +struct fcb_struct { + struct list_head llink; + void (*cbproc)(struct file *, void *, unsigned long *, long *); + void *data; + unsigned long local[FCB_LOCAL_SIZE]; +}; + + +extern long ion_band_table[]; +extern long poll_band_table[]; + + +void file_notify_event(struct file *filep, long *event); + +int file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data); + +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)); + +void file_notify_cleanup(struct file *filep); + + +static inline void file_notify_init(struct file *filep) +{ + rwlock_init(&filep->f_cblock); + INIT_LIST_HEAD(&filep->f_cblist); +} + +static inline void file_send_notify(struct file *filep, long ioevt, long plevt) { + long event[] = { ioevt, plevt, -1 }; + + file_notify_event(filep, event); +} + +#endif diff -Nru linux-2.4.18.vanilla/include/linux/fs.h linux-2.4.18.epoll/include/linux/fs.h --- linux-2.4.18.vanilla/include/linux/fs.h Tue Mar 19 18:38:28 2002 +++ linux-2.4.18.epoll/include/linux/fs.h Wed Mar 27 19:42:18 2002 @@ -535,6 +535,10 @@ /* needed for tty driver, and maybe others */ void *private_data; + /* file callback list */ + rwlock_t f_cblock; + struct list_head f_cblist; + /* preallocated helper kiobuf to speedup O_DIRECT */ struct kiobuf *f_iobuf; long f_iobuf_lock; diff -Nru linux-2.4.18.vanilla/include/linux/list.h linux-2.4.18.epoll/include/linux/list.h --- linux-2.4.18.vanilla/include/linux/list.h Tue Mar 19 18:38:28 2002 +++ linux-2.4.18.epoll/include/linux/list.h Wed Mar 27 19:42:18 2002 @@ -172,6 +172,11 @@ pos = pos->prev, prefetch(pos->prev)) +#define list_first(head) (((head)->next != (head)) ? (head)->next: (struct list_head *) 0) +#define list_last(head) (((head)->prev != (head)) ? (head)->prev: (struct list_head *) 0) +#define list_next(pos, head) (((pos)->next != (head)) ? (pos)->next: (struct list_head *) 0) +#define list_prev(pos, head) (((pos)->prev != (head)) ? (pos)->prev: (struct list_head *) 0) + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ #endif diff -Nru linux-2.4.18.vanilla/include/linux/pipe_fs_i.h linux-2.4.18.epoll/include/linux/pipe_fs_i.h --- linux-2.4.18.vanilla/include/linux/pipe_fs_i.h Wed Apr 25 14:18:23 2001 +++ linux-2.4.18.epoll/include/linux/pipe_fs_i.h Wed Mar 27 19:27:42 2002 @@ -13,6 +13,8 @@ unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + struct file *rdfile; + struct file *wrfile; }; /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual @@ -30,6 +32,8 @@ #define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers) #define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter) #define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter) +#define PIPE_READFILE(inode) ((inode).i_pipe->rdfile) +#define PIPE_WRITEFILE(inode) ((inode).i_pipe->wrfile) #define PIPE_EMPTY(inode) (PIPE_LEN(inode) == 0) #define PIPE_FULL(inode) (PIPE_LEN(inode) == PIPE_SIZE) diff -Nru linux-2.4.18.vanilla/include/net/sock.h linux-2.4.18.epoll/include/net/sock.h --- linux-2.4.18.vanilla/include/net/sock.h Tue Mar 19 18:39:33 2002 +++ linux-2.4.18.epoll/include/net/sock.h Wed Mar 27 19:45:52 2002 @@ -105,7 +105,9 @@ #include #include - +#include +#include +#include /* The AF_UNIX specific socket options */ struct unix_opt { @@ -1215,8 +1217,13 @@ static inline void sk_wake_async(struct sock *sk, int how, int band) { - if (sk->socket && sk->socket->fasync_list) + if (sk->socket) { + if (sk->socket->file) + file_send_notify(sk->socket->file, ion_band_table[band - POLL_IN], + poll_band_table[band - POLL_IN]); + if (sk->socket->fasync_list) sock_wake_async(sk->socket, how, band); + } } #define SOCK_MIN_SNDBUF 2048 diff -Nru linux-2.4.18.vanilla/net/ipv4/tcp.c linux-2.4.18.epoll/net/ipv4/tcp.c --- linux-2.4.18.vanilla/net/ipv4/tcp.c Fri Dec 21 09:42:05 2001 +++ linux-2.4.18.epoll/net/ipv4/tcp.c Wed Mar 27 19:27:42 2002 @@ -470,8 +470,8 @@ if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible(sk->sleep); - if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN)) - sock_wake_async(sock, 2, POLL_OUT); + if (!(sk->shutdown&SEND_SHUTDOWN)) + sk_wake_async(sk, 2, POLL_OUT); } }