diff -urNp ref/arch/i386/kernel/entry.S 2.4.20pre7aa1/arch/i386/kernel/entry.S --- ref/arch/i386/kernel/entry.S Wed Sep 18 01:07:06 2002 +++ 2.4.20pre7aa1/arch/i386/kernel/entry.S Wed Sep 18 01:07:08 2002 @@ -671,7 +671,7 @@ ENTRY(sys_call_table) .long SYMBOL_NAME(sys_fremovexattr) .long SYMBOL_NAME(sys_tkill) .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sendfile64 */ - .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */ + .long SYMBOL_NAME(sys_futex) /* 240 */ .long SYMBOL_NAME(sys_sched_setaffinity) .long SYMBOL_NAME(sys_sched_getaffinity) .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_thread_area */ diff -urNp ref/arch/ppc/kernel/misc.S 2.4.20pre7aa1/arch/ppc/kernel/misc.S --- ref/arch/ppc/kernel/misc.S Wed Sep 18 01:06:51 2002 +++ 2.4.20pre7aa1/arch/ppc/kernel/misc.S Wed Sep 18 01:07:26 2002 @@ -1171,7 +1171,7 @@ _GLOBAL(sys_call_table) .long sys_removexattr .long sys_lremovexattr .long sys_fremovexattr /* 220 */ - .long sys_ni_syscall /* reserved for sys_futex */ + .long sys_futex .long sys_sched_setaffinity .long sys_sched_getaffinity .long sys_ni_syscall /* reserved for sys_security */ diff -urNp ref/fs/dnotify.c 2.4.20pre7aa1/fs/dnotify.c --- ref/fs/dnotify.c Fri Sep 13 06:13:49 2002 +++ 2.4.20pre7aa1/fs/dnotify.c Wed Sep 18 01:07:08 2002 @@ -20,8 +20,6 @@ #include #include -extern void send_sigio(struct fown_struct *fown, int fd, int band); - int dir_notify_enable = 1; static rwlock_t dn_lock = RW_LOCK_UNLOCKED; diff -urNp ref/include/asm-alpha/mman.h 2.4.20pre7aa1/include/asm-alpha/mman.h --- ref/include/asm-alpha/mman.h Thu Mar 16 23:07:09 2000 +++ 2.4.20pre7aa1/include/asm-alpha/mman.h Wed Sep 18 01:07:08 2002 @@ -4,6 +4,7 @@ #define PROT_READ 0x1 /* page can be read */ #define PROT_WRITE 0x2 /* page can be written */ #define PROT_EXEC 0x4 /* page can be executed */ +#define PROT_SEM 0x8 /* page may be used for atomic ops */ #define PROT_NONE 0x0 /* page can not be accessed */ #define MAP_SHARED 0x01 /* Share changes */ diff -urNp 
ref/include/asm-i386/mman.h 2.4.20pre7aa1/include/asm-i386/mman.h --- ref/include/asm-i386/mman.h Wed Mar 15 02:45:20 2000 +++ 2.4.20pre7aa1/include/asm-i386/mman.h Wed Sep 18 01:07:08 2002 @@ -4,6 +4,7 @@ #define PROT_READ 0x1 /* page can be read */ #define PROT_WRITE 0x2 /* page can be written */ #define PROT_EXEC 0x4 /* page can be executed */ +#define PROT_SEM 0x8 /* page may be used for atomic ops */ #define PROT_NONE 0x0 /* page can not be accessed */ #define MAP_SHARED 0x01 /* Share changes */ diff -urNp ref/include/asm-ppc/mman.h 2.4.20pre7aa1/include/asm-ppc/mman.h --- ref/include/asm-ppc/mman.h Tue Jan 22 18:51:12 2002 +++ 2.4.20pre7aa1/include/asm-ppc/mman.h Wed Sep 18 01:07:08 2002 @@ -7,6 +7,7 @@ #define PROT_READ 0x1 /* page can be read */ #define PROT_WRITE 0x2 /* page can be written */ #define PROT_EXEC 0x4 /* page can be executed */ +#define PROT_SEM 0x8 /* page may be used for atomic ops */ #define PROT_NONE 0x0 /* page can not be accessed */ #define MAP_SHARED 0x01 /* Share changes */ diff -urNp ref/include/linux/fs.h 2.4.20pre7aa1/include/linux/fs.h --- ref/include/linux/fs.h Wed Sep 18 01:07:06 2002 +++ 2.4.20pre7aa1/include/linux/fs.h Wed Sep 18 01:07:08 2002 @@ -752,6 +752,7 @@ extern int __get_lease(struct inode *ino extern time_t lease_get_mtime(struct inode *); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); +extern void send_sigio(struct fown_struct *fown, int fd, int band); struct fasync_struct { int magic; diff -urNp ref/include/linux/futex.h 2.4.20pre7aa1/include/linux/futex.h --- ref/include/linux/futex.h Thu Jan 1 01:00:00 1970 +++ 2.4.20pre7aa1/include/linux/futex.h Wed Sep 18 01:07:08 2002 @@ -0,0 +1,9 @@ +#ifndef _LINUX_FUTEX_H +#define _LINUX_FUTEX_H + +/* Operation codes: the second ("op") argument to the futex syscall */ +#define FUTEX_WAIT (0) /* sleep while the int at uaddr still equals val */ +#define FUTEX_WAKE (1) /* wake waiters queued on uaddr; val bounds how many */ +#define FUTEX_FD (2) /* return a pollable fd for uaddr; non-zero val is the signal to send on wake */ + +#endif /* _LINUX_FUTEX_H */ diff -urNp ref/include/linux/hash.h 
2.4.20pre7aa1/include/linux/hash.h
--- ref/include/linux/hash.h	Thu Jan 1 01:00:00 1970
+++ 2.4.20pre7aa1/include/linux/hash.h	Wed Sep 18 01:07:08 2002
@@ -0,0 +1,61 @@
+#ifndef _LINUX_HASH_H
+#define _LINUX_HASH_H
+/* Fast hashing routine for a long.
+   (C) 2002 William Lee Irwin III, IBM */
+
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+#if BITS_PER_LONG == 32
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e370001UL
+#elif BITS_PER_LONG == 64
+/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
+#else
+#error Define GOLDEN_RATIO_PRIME for your wordsize.
+#endif
+
+/*
+ * hash_long - multiplicative hash of one machine word.
+ * @val:  value to hash
+ * @bits: number of result bits wanted
+ *
+ * Multiplies @val by GOLDEN_RATIO_PRIME and returns the top @bits bits
+ * of the product, i.e. a value in [0, 2^bits).
+ */
+static inline unsigned long hash_long(unsigned long val, unsigned int bits)
+{
+	unsigned long hash = val;
+
+	/* On some cpus multiply is faster, on others gcc will do shifts */
+	hash *= GOLDEN_RATIO_PRIME;
+
+	/*
+	 * Shifting by BITS_PER_LONG or more is undefined in C, so the
+	 * degenerate widths are answered without a shift: a 0-bit hash
+	 * can only be 0, and a full-width (or wider) request gets the
+	 * whole product.
+	 */
+	if (bits == 0)
+		return 0;
+	if (bits >= BITS_PER_LONG)
+		return hash;
+
+	/* High bits are more random, so use them. */
+	return hash >> (BITS_PER_LONG - bits);
+}
+
+/* Hash a pointer by its address; the pointee is never dereferenced. */
+static inline unsigned long hash_ptr(const void *ptr, unsigned int bits)
+{
+	return hash_long((unsigned long)ptr, bits);
+}
+#endif /* _LINUX_HASH_H */
diff -urNp ref/kernel/Makefile 2.4.20pre7aa1/kernel/Makefile
--- ref/kernel/Makefile	Wed Sep 18 01:06:57 2002
+++ 2.4.20pre7aa1/kernel/Makefile	Wed Sep 18 01:07:08 2002
@@ -14,7 +14,7 @@ export-objs = signal.o sys.o kmod.o cont
 obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
 	module.o exit.o itimer.o info.o time.o softirq.o resource.o \
 	sysctl.o acct.o capability.o ptrace.o timer.o user.o \
-	signal.o sys.o kmod.o context.o rcupdate.o
+	signal.o sys.o kmod.o context.o rcupdate.o futex.o
 
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += ksyms.o
diff -urNp ref/kernel/futex.c 2.4.20pre7aa1/kernel/futex.c
--- ref/kernel/futex.c	Thu Jan 1 01:00:00 1970
+++ 2.4.20pre7aa1/kernel/futex.c	Wed Sep 18 01:07:08 2002
@@ -0,0 +1,403 @@
+/*
+ *  Fast Userspace Mutexes (which I call "Futexes!").
+ *  (C) Rusty Russell, IBM 2002
+ *
+ *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ *  enough at me, Linus for the original (flawed) idea, Matthew
+ *  Kirkwood for proof-of-concept implementation.
+ *
+ *  "The futexes are also cursed."
+ *  "But they come in a choice of three flavours!"
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * NOTE(review): every header name in this list was lost when the patch
+ * was extracted.  The names below are rebuilt from the symbols the file
+ * uses -- confirm against the original patch.
+ */
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/dcache.h>
+#include <asm/uaccess.h>
+
+/* Simple "sleep if unchanged" interface. */
+
+/* FIXME: This may be way too small. --RR */
+#define FUTEX_HASHBITS 6
+
+/*
+ * send_sigio() is declared in <linux/fs.h> by the fs.h hunk of this
+ * patch, so no private extern is repeated here.
+ */
+
+/* Everyone needs a dentry and inode */
+static struct vfsmount *futex_mnt;
+
+/*
+ * One waiter on a futex.  We use this instead of a normal wait_queue_t,
+ * so we can wake only the relevant ones (hashed queues may be shared).
+ * An entry lives either on the stack of a task in futex_wait() or in
+ * filp->private_data of a FUTEX_FD file.
+ */
+struct futex_q {
+	/* Link in a futex_queues[] bucket; empty again once woken. */
+	struct list_head list;
+	/* Sleepers/pollers to wake when this entry is hit. */
+	wait_queue_head_t waiters;
+	/* Page struct and offset within it. */
+	struct page *page;
+	unsigned int offset;
+	/* For fd, sigio sent using these. */
+	int fd;
+	struct file *filp;
+};
+
+/*
+ * NOTE(review): the text from "futex_queues[1<" up to "waiters);" in
+ * tell_waiter() was destroyed in extraction.  futex_lock, hash_futex()
+ * and the head of tell_waiter() are rebuilt from how the rest of the
+ * file uses them -- confirm against the original patch.
+ */
+
+/* The key for the hash is the address + index + offset within page */
+static struct list_head futex_queues[1<<FUTEX_HASHBITS];
+/* Protects every bucket in futex_queues[] and each futex_q.list. */
+static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+
+/* Pick the bucket for the futex at @offset within @page. */
+static inline struct list_head *hash_futex(struct page *page,
+					   unsigned long offset)
+{
+	unsigned long h;
+
+	/* struct page is shared, so we can hash on its address */
+	h = (unsigned long)page + offset;
+	return &futex_queues[hash_long(h, FUTEX_HASHBITS)];
+}
+
+/*
+ * Notify whoever owns @q: wake its wait queue and, for fd-backed
+ * entries, raise SIGIO on the file.  Called with futex_lock held.
+ */
+static inline void tell_waiter(struct futex_q *q)
+{
+	/* NOTE(review): wake_up_all() is assumed; only "waiters);" survived. */
+	wake_up_all(&q->waiters);
+	if (q->filp)
+		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
+}
+
+/* Drop the reference taken on @page by pin_page(). */
+static inline void unpin_page(struct page *page)
+{
+	/* Avoid releasing the page which is on the LRU list.  I don't
+	   know if this is correct, but it stops the BUG() in
+	   __free_pages_ok(). */
+	page_cache_release(page);
+}
+
+/*
+ * Wake waiters in bucket @head that wait on (@page, @offset).
+ *
+ * Each match is unlinked with list_del_init(), so the waiter later sees
+ * an empty list and knows it was woken.  Returns the number woken.  The
+ * limit is tested only after a wake, so one matching waiter is woken
+ * even when @num <= 0.
+ */
+static int futex_wake(struct list_head *head,
+		      struct page *page,
+		      unsigned int offset,
+		      int num)
+{
+	struct list_head *i, *next;
+	int num_woken = 0;
+
+	spin_lock(&futex_lock);
+	list_for_each_safe(i, next, head) {
+		struct futex_q *this = list_entry(i, struct futex_q, list);
+
+		/* Buckets are shared between futexes: skip the others. */
+		if (this->page != page || this->offset != offset)
+			continue;
+
+		list_del_init(i);
+		tell_waiter(this);
+		num_woken++;
+		if (num_woken >= num)
+			break;
+	}
+	spin_unlock(&futex_lock);
+	return num_woken;
+}
+
+/*
+ * Fill in @q and link it into bucket @head.
+ * Add at end to avoid starvation.
+ */
+static inline void queue_me(struct list_head *head,
+			    struct futex_q *q,
+			    struct page *page,
+			    unsigned int offset,
+			    int fd,
+			    struct file *filp)
+{
+	q->page = page;
+	q->offset = offset;
+	q->fd = fd;
+	q->filp = filp;
+
+	spin_lock(&futex_lock);
+	list_add_tail(&q->list, head);
+	spin_unlock(&futex_lock);
+}
+
+/* Return 1 if we were still queued (ie. 0 means we were woken) */
+static inline int unqueue_me(struct futex_q *q)
+{
+	int ret = 0;
+
+	spin_lock(&futex_lock);
+	if (!list_empty(&q->list)) {
+		list_del(&q->list);
+		ret = 1;
+	}
+	spin_unlock(&futex_lock);
+	return ret;
+}
+
+/* Get kernel address of the user page and pin it. 
*/
+/*
+ * Returns the struct page backing @page_start in the current mm, with
+ * the reference get_user_pages() took on it, or an ERR_PTR() carrying
+ * get_user_pages()'s negative error.  mmap_sem is held for reading
+ * around the lookup only.
+ */
+static struct page *pin_page(unsigned long page_start)
+{
+	struct mm_struct *mm = current->mm;
+	struct page *page;
+	int err;
+
+	down_read(&mm->mmap_sem);
+	err = get_user_pages(current, mm, page_start,
+			     1 /* one page */,
+			     0 /* writable not important */,
+			     0 /* don't force */,
+			     &page,
+			     NULL /* don't return vmas */);
+	up_read(&mm->mmap_sem);
+
+	if (err < 0)
+		return ERR_PTR(err);
+	return page;
+}
+
+/*
+ * FUTEX_WAIT: sleep until woken, provided *@uaddr still holds @val.
+ *
+ * The task is marked TASK_INTERRUPTIBLE and queued before the user
+ * value is read, so a futex_wake() that runs between the read and the
+ * sleep is not lost.
+ *
+ * Returns 0 if woken by futex_wake(), -EFAULT if @uaddr cannot be read,
+ * -EWOULDBLOCK if the value already differs from @val, -ETIMEDOUT if
+ * @time jiffies elapsed, -EINTR if a signal is pending.
+ */
+static int futex_wait(struct list_head *head,
+		      struct page *page,
+		      int offset,
+		      int val,
+		      int *uaddr,
+		      unsigned long time)
+{
+	int curval;
+	struct futex_q q;
+	DECLARE_WAITQUEUE(wait, current);
+	int ret = 0;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	init_waitqueue_head(&q.waiters);
+	add_wait_queue(&q.waiters, &wait);
+	queue_me(head, &q, page, offset, -1, NULL);
+
+	/* Page is pinned, but may no longer be in this address space. */
+	if (get_user(curval, uaddr) != 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (curval != val) {
+		ret = -EWOULDBLOCK;
+		goto out;
+	}
+	time = schedule_timeout(time);
+	if (time == 0) {
+		ret = -ETIMEDOUT;
+		goto out;
+	}
+	if (signal_pending(current)) {
+		ret = -EINTR;
+		goto out;
+	}
+ out:
+	set_current_state(TASK_RUNNING);
+	/*
+	 * A wake-up wins over any error found above: if futex_wake()
+	 * already took us off the list, report success.  The page is
+	 * unpinned by sys_futex() in either case.
+	 */
+	if (!unqueue_me(&q))
+		return 0;
+	return ret;
+}
+
+/*
+ * release() for a FUTEX_FD file: unlink the entry if it never fired,
+ * drop the page reference the fd was holding and free the entry.
+ */
+static int futex_close(struct inode *inode, struct file *filp)
+{
+	struct futex_q *q = filp->private_data;
+
+	spin_lock(&futex_lock);
+	if (!list_empty(&q->list)) {
+		list_del(&q->list);
+		/* Noone can be polling on us now. */
+		BUG_ON(waitqueue_active(&q->waiters));
+	}
+	spin_unlock(&futex_lock);
+	unpin_page(q->page);
+	kfree(filp->private_data);
+	return 0;
+}
+
+/*
+ * poll() for a FUTEX_FD file: readable once futex_wake() has unlinked
+ * the entry.
+ * This is one-shot: once it's gone off you need a new fd.
+ */
+static unsigned int futex_poll(struct file *filp,
+			       struct poll_table_struct *wait)
+{
+	struct futex_q *q = filp->private_data;
+	int ret = 0;
+
+	poll_wait(filp, &q->waiters, wait);
+	spin_lock(&futex_lock);
+	if (list_empty(&q->list))
+		ret = POLLIN | POLLRDNORM;
+	spin_unlock(&futex_lock);
+
+	return ret;
+}
+
+static struct file_operations futex_fops = {
+	release:	futex_close,
+	poll:		futex_poll,
+};
+
+/*
+ * FUTEX_FD: create a file descriptor that becomes readable (and, if
+ * @signal is non-zero, delivers that signal to the caller) when the
+ * futex at (@page, @offset) is woken.
+ *
+ * Signal allows caller to avoid the race which would occur if they
+ * set the sigio stuff up afterwards.
+ *
+ * Returns the new fd, or -EINVAL for a bad @signal, -ENOMEM, -ENFILE,
+ * or get_unused_fd()'s error.  On success the caller's pin on @page is
+ * kept and released by futex_close().
+ */
+static int futex_fd(struct list_head *head,
+		    struct page *page,
+		    int offset,
+		    int signal)
+{
+	int fd;
+	int ret;
+	struct futex_q *q;
+	struct file *filp;
+
+	if (signal < 0 || signal > _NSIG)
+		return -EINVAL;
+
+	/*
+	 * Allocate the entry before the file takes its vfsmount and
+	 * dentry references: put_filp() does not drop those, so a
+	 * kmalloc() failure after mntget()/dget() would leak them.
+	 */
+	q = kmalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+
+	fd = get_unused_fd();
+	if (fd < 0) {
+		ret = fd;
+		goto out_free;
+	}
+	filp = get_empty_filp();
+	if (!filp) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+	filp->f_op = &futex_fops;
+	filp->f_vfsmnt = mntget(futex_mnt);
+	filp->f_dentry = dget(futex_mnt->mnt_root);
+
+	if (signal) {
+		filp->f_owner.pid = current->tgid;
+		filp->f_owner.uid = current->uid;
+		filp->f_owner.euid = current->euid;
+		filp->f_owner.signum = signal;
+	}
+
+	/* Initialize queue structure */
+	init_waitqueue_head(&q->waiters);
+	filp->private_data = q;
+
+	/* Go for it... */
+	queue_me(head, q, page, offset, fd, filp);
+
+	/* Now we map fd to filp, so userspace can access it */
+	fd_install(fd, filp);
+	return fd;
+
+ out_put_fd:
+	put_unused_fd(fd);
+ out_free:
+	kfree(q);
+	return ret;
+}
+
+/*
+ * The futex system call.
+ * @uaddr: user address of the int-sized futex word
+ * @op:    FUTEX_WAIT, FUTEX_WAKE or FUTEX_FD
+ * @val:   expected value (WAIT), wake limit (WAKE) or signal (FD)
+ * @utime: optional relative timeout, NULL to wait without limit
+ *
+ * Returns the result of the selected operation, -EFAULT if @utime
+ * cannot be read, -EINVAL for a malformed @utime, a misaligned @uaddr
+ * or an unknown @op, or pin_page()'s error.
+ */
+asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime)
+{
+	int ret;
+	unsigned long pos_in_page;
+	struct list_head *head;
+	struct page *page;
+	unsigned long time = MAX_SCHEDULE_TIMEOUT;
+
+	if (utime) {
+		struct timespec t;
+		if (copy_from_user(&t, utime, sizeof(t)) != 0)
+			return -EFAULT;
+		/* Don't feed a nonsensical interval to the jiffies conversion. */
+		if (t.tv_sec < 0 || t.tv_nsec < 0 || t.tv_nsec >= 1000000000L)
+			return -EINVAL;
+		time = timespec_to_jiffies(&t) + 1;
+	}
+
+	pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE;
+
+	/* Must be "naturally" aligned, and not on page boundary. */
+	if ((pos_in_page % __alignof__(int)) != 0
+	    || pos_in_page + sizeof(int) > PAGE_SIZE)
+		return -EINVAL;
+
+	/* Simpler if it doesn't vanish underneath us. */
+	page = pin_page((unsigned long)uaddr - pos_in_page);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	/*
+	 * The page stays pinned across the operation and is released
+	 * below, except when FUTEX_FD succeeds and the new fd takes
+	 * over the reference.
+	 */
+	head = hash_futex(page, pos_in_page);
+	switch (op) {
+	case FUTEX_WAIT:
+		ret = futex_wait(head, page, pos_in_page, val, uaddr, time);
+		break;
+	case FUTEX_WAKE:
+		ret = futex_wake(head, page, pos_in_page, val);
+		break;
+	case FUTEX_FD:
+		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
+		ret = futex_fd(head, page, pos_in_page, val);
+		if (ret >= 0)
+			/* Leave page pinned (attached to fd). */
+			return ret;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	unpin_page(page);
+
+	return ret;
+}
+
+/* FIXME: Oh yeah, makes sense to write a filesystem... 
*/
+/* Magic number reported by statfs and stored in the superblock. */
+#define FUTEXFS_MAGIC 0xBAD1DEA
+
+/* statfs for the internal futex filesystem: fixed, made-up geometry. */
+static int futexfs_statfs(struct super_block *sb, struct statfs *buf)
+{
+	buf->f_type = FUTEXFS_MAGIC;
+	buf->f_bsize = 1024;
+	buf->f_namelen = 255;
+	return 0;
+}
+
+static struct super_operations futexfs_ops = { statfs: futexfs_statfs };
+
+/*
+ * Fill in the superblock of the internal futex filesystem: one root
+ * directory inode behind a self-parented "futex" dentry, which
+ * futex_fd() uses as the dentry of every futex file.
+ *
+ * Returns @sb, or NULL if the root inode or dentry cannot be allocated.
+ */
+static struct super_block *
+futexfs_read_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root;
+	struct dentry *dentry;
+
+	sb->s_blocksize = 1024;
+	sb->s_blocksize_bits = 10;
+	sb->s_magic = FUTEXFS_MAGIC;
+	sb->s_op = &futexfs_ops;
+
+	root = new_inode(sb);
+	if (!root)
+		return NULL;
+	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+	root->i_uid = root->i_gid = 0;
+	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+
+	dentry = d_alloc(NULL, &(const struct qstr) { "futex", 5, 0 });
+	if (!dentry) {
+		iput(root);
+		return NULL;
+	}
+	dentry->d_sb = sb;
+	dentry->d_parent = dentry;
+	d_instantiate(dentry, root);
+	sb->s_root = dentry;
+
+	return sb;
+}
+
+static DECLARE_FSTYPE(futex_fs_type,"futexfs",futexfs_read_super,FS_NOMOUNT);
+
+/*
+ * Boot-time setup: initialise the hash buckets, then register and
+ * mount the internal filesystem.  Returns 0 or the error from
+ * register_filesystem()/kern_mount().
+ */
+static int __init init(void)
+{
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
+		INIT_LIST_HEAD(&futex_queues[i]);
+
+	err = register_filesystem(&futex_fs_type);
+	if (err)
+		return err;
+
+	futex_mnt = kern_mount(&futex_fs_type);
+	if (IS_ERR(futex_mnt)) {
+		err = PTR_ERR(futex_mnt);
+		/* Don't leave an error cookie where a vfsmount is expected. */
+		futex_mnt = NULL;
+		unregister_filesystem(&futex_fs_type);
+		return err;
+	}
+	return 0;
+}
+__initcall(init);
diff -urNp ref/mm/mprotect.c 2.4.20pre7aa1/mm/mprotect.c
--- ref/mm/mprotect.c	Wed Sep 18 01:07:06 2002
+++ 2.4.20pre7aa1/mm/mprotect.c	Wed Sep 18 01:07:08 2002
@@ -278,7 +278,7 @@ asmlinkage long sys_mprotect(unsigned lo
 	end = start + len;
 	if (end < start)
 		return -EINVAL;
-	if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
+	if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
 		return -EINVAL;
 	if (end == start)
 		return 0;