diff -urNp x-ref/Documentation/AIO-NOTES x/Documentation/AIO-NOTES --- x-ref/Documentation/AIO-NOTES 1970-01-01 01:00:00.000000000 +0100 +++ x/Documentation/AIO-NOTES 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,3 @@ +- aio context destruction is now synchronous: it waits for all pending + ios to complete. This will now cause a task that is exiting to be + delayed if outstanding ios are executing. diff -urNp x-ref/MAINTAINERS x/MAINTAINERS --- x-ref/MAINTAINERS 2003-02-14 05:22:25.000000000 +0100 +++ x/MAINTAINERS 2003-02-14 05:22:30.000000000 +0100 @@ -237,6 +237,12 @@ M: layes@loran.com L: linux-net@vger.kernel.org S: Maintained +ASYNC IO +P: Benjamin LaHaise +M: bcrl@redhat.com +L: linux-aio@kvack.org +S: Maintained + AX.25 NETWORK LAYER P: Matthias Welwarsky M: dg2fef@afthd.tu-darmstadt.de diff -urNp x-ref/arch/i386/kernel/entry.S x/arch/i386/kernel/entry.S --- x-ref/arch/i386/kernel/entry.S 2003-02-14 05:22:21.000000000 +0100 +++ x/arch/i386/kernel/entry.S 2003-02-14 05:22:30.000000000 +0100 @@ -659,11 +659,11 @@ ENTRY(sys_call_table) .long SYMBOL_NAME(sys_sched_getaffinity) .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_thread_area */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_get_thread_area */ - .long SYMBOL_NAME(sys_ni_syscall) /* 245 sys_io_setup */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_destroy */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_getevents */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_submit */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_cancel */ + .long SYMBOL_NAME(sys_io_setup) /* 245 */ + .long SYMBOL_NAME(sys_io_destroy) + .long SYMBOL_NAME(sys_io_getevents) + .long SYMBOL_NAME(sys_io_submit) + .long SYMBOL_NAME(sys_io_cancel) .long SYMBOL_NAME(sys_ni_syscall) /* 250 sys_alloc_hugepages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_free_hugepages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_exit_group */ diff -urNp x-ref/arch/i386/kernel/semaphore.c x/arch/i386/kernel/semaphore.c --- x-ref/arch/i386/kernel/semaphore.c 2002-11-29 02:22:55.000000000 +0100 +++ x/arch/i386/kernel/semaphore.c 2003-02-14 05:22:30.000000000 +0100 @@ -14,6 +14,7 @@ */ #include #include +#include #include /* @@ -54,6 +55,54 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; +void __wtd_down(struct semaphore * sem, struct worktodo *wtd); + +void __wtd_down_action(void *data) +{ + struct worktodo *wtd = data; + struct semaphore *sem; + + wtd_pop(wtd); + sem = wtd->data; + + __wtd_down(sem, wtd); +} + +void __wtd_down_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct semaphore *sem = wtd->data; + + __remove_wait_queue(&sem->wait, &wtd->wait); + wtd_push(wtd, __wtd_down_action, wtd); + wtd_queue(wtd); +} + +void __wtd_down(struct semaphore * sem, struct worktodo *wtd) +{ + int gotit; + int sleepers; + + init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter); + wtd->data = sem; + + spin_lock_irq(&semaphore_lock); + sem->sleepers++; + sleepers = sem->sleepers; + gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait, + atomic_add_negative(sleepers - 1, &sem->count)); + if (gotit) + sem->sleepers = 0; + else + sem->sleepers = 1; + spin_unlock_irq(&semaphore_lock); + + if (gotit) { + wake_up(&sem->wait); + wtd_queue(wtd); + } +} + void __down(struct semaphore * sem) { struct task_struct *tsk = current; @@ -257,6 +306,21 @@ asm( "ret" ); +asm( +".text\n" +".align 4\n" +".globl __wtd_down_failed\n" +"__wtd_down_failed:\n\t" + "pushl %eax\n\t" + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __wtd_down\n\t" 
+ "popl %ecx\n\t" + "popl %edx\n\t" + "popl %eax\n\t" + "ret" +); + /* * rw spinlock fallbacks */ diff -urNp x-ref/arch/x86_64/kernel/semaphore.c x/arch/x86_64/kernel/semaphore.c --- x-ref/arch/x86_64/kernel/semaphore.c 2002-11-29 02:22:58.000000000 +0100 +++ x/arch/x86_64/kernel/semaphore.c 2003-02-14 05:22:30.000000000 +0100 @@ -14,6 +14,7 @@ */ #include #include +#include #include @@ -167,4 +168,51 @@ int __down_trylock(struct semaphore * se return 1; } +void __wtd_down(struct semaphore * sem, struct worktodo *wtd); + +void __wtd_down_action(void *data) +{ + struct worktodo *wtd = data; + struct semaphore *sem; + + wtd_pop(wtd); + sem = wtd->data; + + __wtd_down(sem, wtd); +} + +void __wtd_down_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct semaphore *sem = wtd->data; + + __remove_wait_queue(&sem->wait, &wtd->wait); + wtd_push(wtd, __wtd_down_action, wtd); + wtd_queue(wtd); +} + +void __wtd_down(struct semaphore * sem, struct worktodo *wtd) +{ + int gotit; + int sleepers; + + init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter); + wtd->data = sem; + + spin_lock_irq(&semaphore_lock); + sem->sleepers++; + sleepers = sem->sleepers; + gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait, + atomic_add_negative(sleepers - 1, &sem->count)); + if (gotit) + sem->sleepers = 0; + else + sem->sleepers = 1; + spin_unlock_irq(&semaphore_lock); + + if (gotit) { + wake_up(&sem->wait); + wtd_queue(wtd); + } +} diff -urNp x-ref/arch/x86_64/lib/thunk.S x/arch/x86_64/lib/thunk.S --- x-ref/arch/x86_64/lib/thunk.S 2002-11-29 02:22:58.000000000 +0100 +++ x/arch/x86_64/lib/thunk.S 2003-02-14 05:22:30.000000000 +0100 @@ -41,7 +41,8 @@ thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up - + thunk __wtd_down_failed,__wtd_down + restore: RESTORE_ARGS ret diff -urNp x-ref/drivers/char/raw.c x/drivers/char/raw.c --- x-ref/drivers/char/raw.c 2003-02-14 05:22:15.000000000 +0100 +++ x/drivers/char/raw.c 2003-02-14 05:22:30.000000000 +0100 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #define dprintk(x...) 
@@ -36,7 +38,8 @@ int raw_open(struct inode *, struct file int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); int raw_ioctl(struct inode *, struct file *, unsigned int, unsigned long); - +int raw_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_write(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); static struct file_operations raw_fops = { read: raw_read, @@ -44,6 +47,10 @@ static struct file_operations raw_fops = open: raw_open, release: raw_release, ioctl: raw_ioctl, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, + kvec_read: raw_kvec_read, + kvec_write: raw_kvec_write, }; static struct file_operations raw_ctl_fops = { @@ -407,3 +414,100 @@ ssize_t rw_raw_dev(int rw, struct file * out: return err; } + +static int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, READ, cb, size, pos); +} + +int raw_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, WRITE, cb, size, pos); +} + +int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos) +{ + int err; + unsigned minor; + kdev_t dev; + unsigned long limit, blocknr, blocks; + + unsigned sector_size, sector_bits, sector_mask; + unsigned max_sectors; + unsigned i; + + pr_debug("raw_kvec_rw: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos); + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; + sector_mask = sector_size- 1; + max_sectors = KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + pr_debug ("raw_kvec_rw: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + /* EOF at the end */ + err = 0; + if (!size || (pos >> sector_bits) == limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + cb.fn(cb.data, cb.vec, err); + return 0; + } + + /* ENXIO for io beyond the end */ + err = -ENXIO; + if ((pos >> sector_bits) >= limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + goto out; + } + + err = -EINVAL; + if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) { + pr_debug("pos(%Ld)/size(%lu) wrong(%d)\n", pos, size, sector_mask); + goto out; + } + + /* Verify that the scatter-gather list is sector aligned. */ + for (i=0; inr; i++) + if ((cb.vec->veclet[i].offset & sector_mask) || + (cb.vec->veclet[i].length & sector_mask)) { + pr_debug("veclet offset/length wrong"); + goto out; + } + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. 
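+ * The chunk is clamped to max_sectors (derived from KIO_MAX_SECTORS) and to the
+ * sectors remaining before the device limit (limit - blocknr) before being handed
+ * to brw_kvec_async().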
+ */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + if (!blocks) { + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + goto out; + } + + err = brw_kvec_async(rw, cb, dev, blocks, blocknr, sector_bits); + async_run_tq_disk(); +out: + if (err) + printk(KERN_DEBUG "raw_kvec_rw: ret is %d\n", err); + return err; +} + diff -urNp x-ref/fs/Makefile x/fs/Makefile --- x-ref/fs/Makefile 2003-02-14 05:22:25.000000000 +0100 +++ x/fs/Makefile 2003-02-14 05:22:30.000000000 +0100 @@ -20,6 +20,9 @@ obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-y += aio.o +export-objs += aio.o + subdir-$(CONFIG_PROC_FS) += proc subdir-y += partitions diff -urNp x-ref/fs/aio.c x/fs/aio.c --- x-ref/fs/aio.c 1970-01-01 01:00:00.000000000 +0100 +++ x/fs/aio.c 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,1377 @@ +/* + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * See ../COPYING for licensing terms. + */ +#include +#include +#include +#include + +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) do { ; } while (0) +#endif + +/*------ sysctl variables----*/ +unsigned aio_nr; /* current system wide number of aio requests */ +unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +unsigned aio_max_size = 0x20000; /* 128KB per chunk */ +unsigned aio_max_pinned; /* set to mem/4 in aio_setup */ +/*----end sysctl variables---*/ + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kioctx_cachep; + +/* Used for rare fput completion. */ +static void aio_fput_routine(void *); +static struct tq_struct fput_tqueue = { + routine: aio_fput_routine, +}; + +static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(fput_head); + +/* forward prototypes */ +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res); +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res); + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. 
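+ * Registered as an __initcall (see the bottom of this file), so the caches
+ * exist before any io_* syscall can be reached.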
+ */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + aio_max_pinned = num_physpages/4; + + printk(KERN_NOTICE "aio_setup: num_physpages = %u\n", aio_max_pinned); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +static void ioctx_free_reqs(struct kioctx *ctx) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, &ctx->free_reqs) { + struct kiocb *iocb = list_kiocb(pos); + list_del(&iocb->list); + kmem_cache_free(kiocb_cachep, iocb); + } +} + +static void aio_free_ring(struct kioctx *ctx) +{ + struct aio_ring_info *info = &ctx->ring_info; + + if (info->kvec) { + unmap_kvec(info->kvec, 1); + free_kvec(info->kvec); + } + + if (info->mmap_size) { + down_write(&ctx->mm->mmap_sem); + do_munmap(ctx->mm, info->mmap_base, info->mmap_size); + up_write(&ctx->mm->mmap_sem); + } + + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + info->ring_pages = NULL; + info->nr = 0; +} + +static int aio_setup_ring(struct kioctx *ctx) +{ + struct aio_ring *ring; + struct aio_ring_info *info = &ctx->ring_info; + unsigned nr_reqs = ctx->max_reqs; + unsigned long size; + int nr_pages, i; + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_reqs += 2; /* 1 is required, 2 for good luck */ + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_reqs; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages < 0) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_reqs = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + dprintk("attempting mmap of %lu bytes\n", info->mmap_size); + down_write(&ctx->mm->mmap_sem); + info->mmap_base = do_mmap(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, + 0); + up_write(&ctx->mm->mmap_sem); + if (IS_ERR((void *)info->mmap_base)) { + printk("mmap err: %ld\n", -info->mmap_base); + info->mmap_size = 0; + aio_free_ring(ctx); + return -EAGAIN; + } + dprintk("mmap address: 0x%08lx\n", info->mmap_base); + info->kvec = map_user_kvec(READ, info->mmap_base, info->mmap_size); + if (unlikely(IS_ERR(info->kvec))) { + info->kvec = NULL; + aio_free_ring(ctx); + return -EAGAIN; + } + + if (unlikely(info->kvec->nr != nr_pages)) + BUG(); + + for (i=0; ikvec->veclet[i].offset)) + BUG(); + info->ring_pages[i] = info->kvec->veclet[i].page; + //printk("[%d] %p -> %p\n", i, info->kvec->veclet[i].page, + // info->pages[i]); + } + + + ctx->user_id = info->mmap_base; + + info->nr = nr_reqs; /* trusted copy */ + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring->nr = nr_reqs; /* user copy */ + ring->id = ctx->user_id; + kunmap_atomic(ring, KM_USER0); + + return 0; +} + +/* aio_ring_event: returns a pointer to the event at the given index from + * kmap_atomic(, km). 
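+ * Events on the first ring page sit behind the struct aio_ring header, so indexes
+ * of AIO_EVENTS_FIRST_PAGE and above are remapped onto the pages that follow.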
Release the pointer with put_aio_ring_event(); + */ +static inline struct io_event *aio_ring_event(struct aio_ring_info *info, int nr, enum km_type km) +{ + struct io_event *events; +#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) + + if (nr < AIO_EVENTS_FIRST_PAGE) { + struct aio_ring *ring; + ring = kmap_atomic(info->ring_pages[0], km); + return &ring->io_events[nr]; + } + nr -= AIO_EVENTS_FIRST_PAGE; + + events = kmap_atomic(info->ring_pages[1 + nr / AIO_EVENTS_PER_PAGE], km); + + return events + (nr % AIO_EVENTS_PER_PAGE); +} + +static inline void put_aio_ring_event(struct io_event *event, enum km_type km) +{ + void *p = (void *)((unsigned long)event & PAGE_MASK); + kunmap_atomic(p, km); +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_reqs) +{ + struct kioctx *ctx; + unsigned i; + + /* Prevent overflows */ + if ((nr_reqs > (0x10000000U / sizeof(struct io_event))) || + (nr_reqs > (0x10000000U / sizeof(struct kiocb)))) { + pr_debug("ENOMEM: nr_reqs too high\n"); + return ERR_PTR(-EINVAL); + } + + if (nr_reqs > aio_max_nr) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + ctx->max_reqs = nr_reqs; + ctx->mm = current->mm; + atomic_inc(&ctx->mm->mm_count); + + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->lock); + spin_lock_init(&ctx->ring_info.ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->free_reqs); + INIT_LIST_HEAD(&ctx->active_reqs); + + if (aio_setup_ring(ctx) < 0) + goto out_freectx; + + /* Allocate nr_reqs iocbs for io. Free iocbs are on the + * ctx->free_reqs list. When active they migrate to the + * active_reqs list. During completion and cancellation + * the request may temporarily not be on any list. + */ + for (i=0; ikey = i; + iocb->users = 0; + list_add(&iocb->list, &ctx->free_reqs); + } + + /* now link into global list. kludge. FIXME */ + br_write_lock(BR_AIO_REQ_LOCK); + if (unlikely(aio_nr + ctx->max_reqs > aio_max_nr)) + goto out_cleanup; + aio_nr += ctx->max_reqs; /* undone by __put_ioctx */ + ctx->next = current->mm->ioctx_list; + current->mm->ioctx_list = ctx; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, current->mm, ctx->ring_info.ring->nr); + return ctx; + +out_cleanup: + br_write_unlock(BR_AIO_REQ_LOCK); + ctx->max_reqs = 0; /* prevent __put_ioctx from sub'ing aio_nr */ + __put_ioctx(ctx); + return ERR_PTR(-EAGAIN); + +out_freering: + aio_free_ring(ctx); + ioctx_free_reqs(ctx); +out_freectx: + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + dprintk("aio: error allocating ioctx %p\n", ctx); + return ctx; +} + +/* aio_cancel_all + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. 
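+ * Each iocb's cancel method is invoked with ctx->lock dropped; iocb->users is
+ * bumped beforehand so the request cannot be freed underneath the cancellation.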
+ */ +static void aio_cancel_all(struct kioctx *ctx) +{ + int (*cancel)(struct kiocb *, struct io_event *); + struct io_event res; + spin_lock_irq(&ctx->lock); + ctx->dead = 1; + while (!list_empty(&ctx->active_reqs)) { + struct list_head *pos = ctx->active_reqs.next; + struct kiocb *iocb = list_kiocb(pos); + list_del_init(&iocb->list); + cancel = iocb->cancel; + if (cancel) + iocb->users++; + spin_unlock_irq(&ctx->lock); + if (cancel) + cancel(iocb, &res); + spin_lock_irq(&ctx->lock); + } + spin_unlock_irq(&ctx->lock); +} + +void wait_for_all_aios(struct kioctx *ctx) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + if (!ctx->reqs_active) + return; + + add_wait_queue(&ctx->wait, &wait); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (ctx->reqs_active) { + dprintk("ctx->reqs_active = %d\n", ctx->reqs_active); + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); +} + +/* exit_aio: called when the last user of mm goes away. At this point, + * there is no way for any new requests to be submited or any of the + * io_* syscalls to be called on the context. However, there may be + * outstanding requests which hold references to the context; as they + * go away, they will call put_ioctx and release any pinned memory + * associated with the request (held via struct page * references). + */ +void exit_aio(struct mm_struct *mm) +{ + struct kioctx *ctx = mm->ioctx_list; + mm->ioctx_list = NULL; + while (ctx) { + struct kioctx *next = ctx->next; + ctx->next = NULL; + aio_cancel_all(ctx); + + wait_for_all_aios(ctx); + + if (1 != atomic_read(&ctx->users)) + dprintk(KERN_DEBUG + "exit_aio:ioctx still alive: %d %d %d\n", + atomic_read(&ctx->users), ctx->dead, + ctx->reqs_active); + put_ioctx(ctx); + ctx = next; + } +} + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. + */ +void __put_ioctx(struct kioctx *ctx) +{ + unsigned nr_reqs = ctx->max_reqs; + + if (unlikely(ctx->reqs_active)) + BUG(); + + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + ioctx_free_reqs(ctx); + kmem_cache_free(kioctx_cachep, ctx); + + br_write_lock(BR_AIO_REQ_LOCK); + aio_nr -= nr_reqs; + br_write_unlock(BR_AIO_REQ_LOCK); +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns -EAGAIN if no requests are free. + */ +static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx)); +static struct kiocb *__aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req = NULL; + struct aio_ring *ring; + + /* Use cmpxchg instead of spin_lock? 
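+	 * For now ctx->lock covers both the free list and the reqs_active vs
+	 * aio_ring_avail() check, so a request is only handed out while there is
+	 * guaranteed space in the event ring.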
*/ + spin_lock_irq(&ctx->lock); + ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + if (likely(!list_empty(&ctx->free_reqs) && + (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)))) { + req = list_kiocb(ctx->free_reqs.next); + list_del(&req->list); + list_add(&req->list, &ctx->active_reqs); + ctx->reqs_active++; + req->user_obj = NULL; + get_ioctx(ctx); + + if (unlikely(req->ctx != NULL)) + BUG(); + req->ctx = ctx; + if (unlikely(req->users)) + BUG(); + req->users = 1; + } + kunmap_atomic(ring, KM_USER0); + spin_unlock_irq(&ctx->lock); + + return req; +} + +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + /* Handle a potential starvation case -- should be exceedingly rare as + * requests will be stuck on fput_head only if the aio_fput_routine is + * delayed and the requests were the last user of the struct file. + */ + req = __aio_get_req(ctx); + if (unlikely(NULL == req)) { + aio_fput_routine(NULL); + req = __aio_get_req(ctx); + } + return req; +} + +static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +{ + req->ctx = NULL; + req->filp = NULL; + req->user_obj = NULL; + ctx->reqs_active--; + list_add(&req->list, &ctx->free_reqs); + + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up(&ctx->wait); +} + +static void aio_fput_routine(void *data) +{ + spin_lock_irq(&fput_lock); + while (likely(!list_empty(&fput_head))) { + struct kiocb *req = list_kiocb(fput_head.next); + struct kioctx *ctx = req->ctx; + + list_del(&req->list); + spin_unlock_irq(&fput_lock); + + /* Complete the fput */ + __fput(req->filp); + + /* Link the iocb into the context's free list */ + spin_lock_irq(&ctx->lock); + really_put_req(ctx, req); + spin_unlock_irq(&ctx->lock); + + put_ioctx(ctx); + spin_lock_irq(&fput_lock); + } + spin_unlock_irq(&fput_lock); +} + +/* __aio_put_req + * Returns true if this put was the last user of the request. + */ +static inline int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", + req, atomic_read(&req->filp->f_count)); + + req->users --; + if (unlikely(req->users < 0)) + BUG(); + if (likely(req->users)) + return 0; + list_del(&req->list); /* remove from active_reqs */ + req->cancel = NULL; + + /* Must be done under the lock to serialise against cancellation. + * Call this aio_fput as it duplicates fput via the fput_tqueue. + */ + if (unlikely(atomic_dec_and_test(&req->filp->f_count))) { + get_ioctx(ctx); + spin_lock(&fput_lock); + list_add(&req->list, &fput_head); + spin_unlock(&fput_lock); + schedule_task(&fput_tqueue); + } else + really_put_req(ctx, req); + return 1; +} + +/* aio_put_req + * Returns true if this put was the last user of the kiocb, + * false if the request is still in use. + */ +int aio_put_req(struct kiocb *req) +{ + struct kioctx *ctx = req->ctx; + int ret; + spin_lock_irq(&ctx->lock); + ret = __aio_put_req(ctx, req); + spin_unlock_irq(&ctx->lock); + if (ret) + put_ioctx(ctx); + return ret; +} + +/* Lookup an ioctx id. ioctx_list is lockless for reads. + * FIXME: this is O(n) and is only suitable for development. 
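+ * Readers walk mm->ioctx_list under br_read_lock(BR_AIO_REQ_LOCK); ioctx_alloc()
+ * and io_destroy() modify the list under the corresponding br_write_lock.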
+ */ +static inline struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct kioctx *ioctx; + struct mm_struct *mm; + + mm = current->mm; + br_read_lock(BR_AIO_REQ_LOCK); + for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) + if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { + get_ioctx(ioctx); + break; + } + br_read_unlock(BR_AIO_REQ_LOCK); + + return ioctx; +} + +/* aio_complete + * Called when the io request on the given iocb is complete. + * Returns true if this is the last user of the request. The + * only other user of the request can be the cancellation code. + */ +int aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ctx; + struct aio_ring_info *info = &ctx->ring_info; + struct aio_ring *ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + int ret; + + /* add a completion event to the ring buffer. + * must be done holding ctx->lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->lock, flags); + + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + + tail = info->tail; + event = aio_ring_event(info, tail, KM_IRQ0); + tail = (tail + 1) % info->nr; + + event->obj = (u64)(unsigned long)iocb->user_obj; + event->data = iocb->user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->user_obj, iocb->user_data, res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + wmb(); + + info->tail = tail; + ring->tail = tail; + + mb(); + if (!ring->woke) + ring->woke = 1; + + put_aio_ring_event(event, KM_IRQ0); + kunmap_atomic(ring, KM_IRQ1); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); + + /* everything turned out well, dispose of the aiocb. */ + ret = __aio_put_req(ctx, iocb); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + + if (ret) + put_ioctx(ctx); + + return ret; +} + +/* aio_read_evt + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched (0 or 1 ;-) + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
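+ * Only the head pointer is advanced here, under info->ring_lock; the tail is
+ * owned by aio_complete(), which updates it under ctx->lock.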
+ */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring_info *info = &ioctx->ring_info; + struct aio_ring *ring; + unsigned long head; + int ret = 0; + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + dprintk("in aio_read_evt h%lu t%lu m%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail, + (unsigned long)ring->nr); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&info->ring_lock); + + head = ring->head % info->nr; + if (head != ring->tail) { + struct io_event *evp = aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + barrier(); + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } + spin_unlock(&info->ring_lock); + +out: + kunmap_atomic(ring, KM_USER0); + dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + struct task_struct *tsk; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up_process(to->tsk); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + to->tsk = current; +} + +static inline void set_timeout(long start_jiffies, struct timeout *to, + const struct timespec *ts) +{ + unsigned long how_long; + + if (ts->tv_sec < 0 || (!ts->tv_sec && !ts->tv_nsec)) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, + long min_nr, long nr, + struct io_event *event, + struct timespec *timeout) +{ + long start_jiffies = jiffies; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct timeout to; + + /* needed to zero any padding within an entry (there shouldn't be + * any, but C is fun! + */ + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (likely(i < nr)) { + ret = aio_read_evt(ctx, &ent); + if (unlikely(ret <= 0)) + break; + + dprintk("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* Could we split the check in two? */ + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + ret = 0; + + /* Good, event copied to userland, update counts. 
*/ + event ++; + i ++; + } + + if (min_nr <= i) + return i; + if (ret) + return ret; + + /* End fast path */ + + if (timeout) { + struct timespec ts; + ret = -EFAULT; + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + goto out; + + init_timeout(&to); + set_timeout(start_jiffies, &to, &ts); + } else { + to.timed_out = 0; + } + + while (likely(i < nr)) { + add_wait_queue_exclusive(&ctx->wait, &wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx, &ent); + if (ret) + break; + if (min_nr <= i) + break; + ret = 0; + if (to.timed_out) /* Only check after read evt */ + break; + schedule(); + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + /*ret = aio_read_evt(ctx, &ent);*/ + } while (1) ; + + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + + if (unlikely(ret <= 0)) + break; + + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (timeout) + clear_timeout(&to); +out: + return i ? i : ret; +} + +/* Take an ioctx and remove it from the list of ioctx's. Protects + * against races with itself via ->dead. + */ +static void io_destroy(struct kioctx *ioctx) +{ + struct kioctx **tmp; + int was_dead; + + /* delete the entry from the list is someone else hasn't already */ + br_write_lock(BR_AIO_REQ_LOCK); + was_dead = ioctx->dead; + ioctx->dead = 1; + for (tmp = ¤t->mm->ioctx_list; *tmp && *tmp != ioctx; + tmp = &(*tmp)->next) + ; + if (*tmp) + *tmp = ioctx->next; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio_release(%p)\n", ioctx); + if (likely(!was_dead)) + put_ioctx(ioctx); /* twice for the list */ + + aio_cancel_all(ioctx); + wait_for_all_aios(ioctx); + put_ioctx(ioctx); /* once for the lookup */ +} + +/* sys_io_setup: + * Create an aio_context capable of receiving at least nr_events. + * ctxp must not point to an aio_context that already exists, and + * must be initialized to 0 prior to the call. On successful + * creation of the aio_context, *ctxp is filled in with the resulting + * handle. May fail with -EINVAL if *ctxp is not initialized, + * if the specified nr_events exceeds internal limits. May fail + * with -EAGAIN if the specified nr_events exceeds the user's limit + * of available events. May fail with -ENOMEM if insufficient kernel + * resources are available. May fail with -EFAULT if an invalid + * pointer is passed for ctxp. Will fail with -ENOSYS if not + * implemented. + */ +asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || !nr_reqs || (int)nr_reqs < 0)) { + pr_debug("EINVAL: io_setup: ctx or nr_reqs > max\n"); + goto out; + } + + ioctx = ioctx_alloc(nr_reqs); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) + return 0; + io_destroy(ioctx); + } + +out: + return ret; +} + +/* sys_io_destroy: + * Destroy the aio_context specified. May cancel any outstanding + * AIOs and block on completion. Will fail with -ENOSYS if not + * implemented. May fail with -EFAULT if the context pointed to + * is invalid. 
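+ * As noted in Documentation/AIO-NOTES, destruction is synchronous: outstanding
+ * requests are cancelled where possible and the call waits for the remainder to
+ * complete before returning.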
+ */ +asmlinkage long sys_io_destroy(aio_context_t ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + io_destroy(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +static inline int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, + struct iocb *iocb) +{ + ssize_t (*op)(struct file *, struct kiocb *, struct iocb *); + struct kiocb *req; + struct file *file; + ssize_t ret; + char *buf; + + /* enforce forwards compatibility on users */ + if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 || + iocb->aio_reserved3)) { + pr_debug("EINVAL: io_submit: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) + )) { + pr_debug("EINVAL: io_submit: overflow check\n"); + return -EINVAL; + } + + file = fget(iocb->aio_fildes); + if (unlikely(!file)) + return -EBADF; + + req = aio_get_req(ctx); + if (unlikely(!req)) { + fput(file); + return -EAGAIN; + } + + req->filp = file; + iocb->aio_key = req->key; + ret = put_user(iocb->aio_key, &user_iocb->aio_key); + if (unlikely(ret)) { + dprintk("EFAULT: aio_key\n"); + goto out_put_req; + } + + req->user_obj = user_iocb; + req->user_data = iocb->aio_data; + req->buf = iocb->aio_buf; + req->pos = iocb->aio_offset; + req->size = iocb->aio_nbytes; + req->nr_transferred = 0; + + buf = (char *)(unsigned long)iocb->aio_buf; + + switch (iocb->aio_lio_opcode) { + case IOCB_CMD_PREAD: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_put_req; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, buf, iocb->aio_nbytes))) + goto out_put_req; + op = file->f_op->aio_read; + break; + case IOCB_CMD_PWRITE: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_put_req; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_READ, buf, iocb->aio_nbytes))) + goto out_put_req; + op = file->f_op->aio_write; + break; + case IOCB_CMD_FDSYNC: + case IOCB_CMD_FSYNC: + op = file->f_op->aio_fsync; + break; + default: + dprintk("EINVAL: io_submit: no operation %d provided by aio\n", + iocb->aio_lio_opcode); + ret = -EINVAL; + goto out_put_req; + } + + if (unlikely(!op)) { + dprintk("EINVAL: io_submit: no operation %d provided by lowlevel\n", + iocb->aio_lio_opcode); + ret = -EINVAL; + goto out_put_req; + } + + ret = op(file, req, iocb); + if (unlikely(ret)) { + /* A completion event was sent, so + * submit is a success. */ + pr_debug("io_submit: op returned %ld\n", ret); + aio_complete(req, ret, 0); + } + return 0; + +out_put_req: + aio_put_req(req); + return ret; +} + +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. 
+ */ +asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, + struct iocb **iocbpp) +{ + struct kioctx *ctx; + long ret = 0; + int i; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: io_submit: invalid context id\n"); + return -EINVAL; + } + + for (i=0; ithis_size; + unsigned long buf = iocb->buf; + kvec_cb_t cb; + ssize_t res; + + iocb->this_size = iocb->size - iocb->nr_transferred; + if (iocb->this_size > aio_max_size) + iocb->this_size = aio_max_size; + + buf += iocb->nr_transferred; + cb.vec = mm_map_user_kvec(iocb->ctx->mm, rw, buf, iocb->this_size); + cb.fn = (rw == READ) ? generic_aio_complete_read + : generic_aio_complete_write; + cb.data = iocb; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (unlikely(IS_ERR(cb.vec))) + goto done; + + kvec_op = (rw == READ) ? iocb->filp->f_op->kvec_read + : iocb->filp->f_op->kvec_write; + dprintk("submit: %d %d %d\n", iocb->this_size, iocb->nr_transferred, iocb->size); + res = kvec_op(iocb->filp, cb, iocb->this_size, + iocb->pos + iocb->nr_transferred); + if (!res) { + dprintk("submit okay\n"); + return; + } + dprintk("submit failed: %d\n", res); + + cb.fn(cb.data, cb.vec, res); + return; + +done: + if (unlikely(!iocb->nr_transferred)) + BUG(); + aio_complete(iocb, iocb->nr_transferred, 0); +} + +static void generic_aio_complete_rw(int rw, void *_iocb, struct kvec *vec, ssize_t res) +{ + struct kiocb *iocb = _iocb; + + unmap_kvec(vec, rw == READ); + free_kvec(vec); + + if (res > 0) + iocb->nr_transferred += res; + + /* Was this chunk successful? Is there more left to transfer? */ + if (res == iocb->this_size && iocb->nr_transferred < iocb->size) { + /* We may be in irq context, so queue processing in + * process context. + */ + iocb->this_size = rw; + INIT_TQUEUE(&iocb->u.tq, generic_aio_next_chunk, iocb); + schedule_task(&iocb->u.tq); + return; + } + + aio_complete(iocb, iocb->nr_transferred ? iocb->nr_transferred : res, + 0); +} + +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(READ, _iocb, vec, res); +} + +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(WRITE, _iocb, vec, res); +} + +ssize_t generic_aio_rw(int rw, struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size) +{ + int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t); + unsigned long buf = iocb->aio_buf; + size_t size = iocb->aio_nbytes; + size_t nr_read = 0; + loff_t pos = iocb->aio_offset; + kvec_cb_t cb; + ssize_t res; + +#if 0 + if (likely(NULL != file->f_op->new_read)) { + nr_read = file->f_op->new_read(file, (void *)buf, size, + &pos, F_ATOMIC); + dprintk("from new_read: nr_read: %ld\n", (long)nr_read); + if ((-EAGAIN == nr_read) || (-EWOULDBLOCKIO == nr_read)) + nr_read = 0; + else if ((nr_read >= min_size) || (nr_read < 0)) { + dprintk("returning nr_read: %ld\n", (long)nr_read); + return nr_read; + } + } + dprintk("nr_read: %ld\n", (long)nr_read); +#endif + + req->nr_transferred = nr_read; + size -= nr_read; + if (size > aio_max_size) + /* We have to split up the request. Pin the mm + * struct for further use with map_user_kvec later. + */ + size = aio_max_size; + else + req->buf = 0; + + req->this_size = size; + + buf += nr_read; + cb.vec = map_user_kvec(rw, buf, size); + cb.fn = (rw == READ) ? 
generic_aio_complete_read + : generic_aio_complete_write; + cb.data = req; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (IS_ERR(cb.vec)) + return nr_read ? nr_read : PTR_ERR(cb.vec); + + kvec_op = (rw == READ) ? file->f_op->kvec_read : file->f_op->kvec_write; + + res = kvec_op(file, cb, size, pos); + if (unlikely(res != 0)) { + /* If the first chunk was successful, we have to run + * the callback to attempt the rest of the io. + */ + if (res == size && req->buf) { + cb.fn(cb.data, cb.vec, res); + return 0; + } + + unmap_kvec(cb.vec, rw == READ); + free_kvec(cb.vec); + if (nr_read) { + if (res < 0) + res = 0; + res += nr_read; + } + } + return res; +} + +ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + return generic_aio_rw(READ, file, req, iocb, iocb->aio_nbytes); +} + +ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size) +{ + return generic_aio_rw(WRITE, file, req, iocb, 1); +#if 0 + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + loff_t pos = iocb.aio_offset; + ssize_t nr_written = 0; + kvec_cb_t cb; + long res; +#if 0 + if (likely(NULL != file->f_op->new_write)) { + nr_written = file->f_op->new_write(file, (void *)buf, size, + &pos, F_ATOMIC); + pr_debug("generic_aio_write: new_write: %ld\n", (long)nr_written); + if (-EAGAIN == nr_written) + nr_written = 0; + if ((nr_written >= min_size) || (nr_written < 0)) + return nr_written; + } +#endif + + req->nr_transferred = nr_written; + size -= nr_written; + if (size > aio_max_size) + size = aio_max_size; + req->this_size = size; + buf += nr_written; + cb.vec = map_user_kvec(WRITE, buf, size); + cb.fn = generic_aio_complete_write; + cb.data = req; + + if (IS_ERR(cb.vec)) { + pr_debug("generic_aio_write: map_user_kvec: %ld\n", PTR_ERR(cb.vec)); + return nr_written ? nr_written : PTR_ERR(cb.vec); + } + + res = file->f_op->kvec_write(file, cb, size, iocb.aio_offset); + pr_debug("generic_aio_write: kvec_write: %ld\n", res); + if (unlikely(res != 0)) { + unmap_kvec(cb.vec, 0); + free_kvec(cb.vec); + if (nr_written) { + if (res < 0) + res = 0; + res += nr_written; + } + } + return res; +#endif +} + +ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + return generic_aio_write(file, req, iocb, iocb->aio_nbytes); +} + +/* lookup_kiocb + * Finds a given iocb for cancellation. + * MUST be called with ctx->lock held. + */ +struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb *iocb, u32 key) +{ + struct list_head *pos; + /* TODO: use a hash or array, this sucks. */ + list_for_each(pos, &ctx->free_reqs) { + struct kiocb *kiocb = list_kiocb(pos); + if (kiocb->user_obj == iocb && kiocb->key == key) + return kiocb; + } + return NULL; +} + +/* sys_io_cancel: + * Attempts to cancel an iocb previously passed to io_submit. If + * the operation is successfully cancelled, the resulting event is + * copied into the memory pointed to by result without being placed + * into the completion queue and 0 is returned. May fail with + * -EFAULT if any of the data structures pointed to are invalid. + * May fail with -EINVAL if aio_context specified by ctx_id is + * invalid. May fail with -EAGAIN if the iocb specified was not + * cancelled. Will fail with -ENOSYS if not implemented. 
+ */ +asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb, + struct io_event *result) +{ + int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct kioctx *ctx; + struct kiocb *kiocb; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb && kiocb->cancel) { + cancel = kiocb->cancel; + kiocb->users ++; + } else + cancel = NULL; + spin_unlock_irq(&ctx->lock); + + if (NULL != cancel) { + struct io_event tmp; + ret = cancel(kiocb, &tmp); + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(result, &tmp, sizeof(tmp))) + ret = -EFAULT; + } + } else + dprintk(KERN_DEBUG "iocb has no cancel operation\n"); + + put_ioctx(ctx); + + return ret; +} + +/* io_getevents: + * Attempts to read at least min_nr events and up to nr events from + * the completion queue for the aio_context specified by ctx_id. May + * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, + * if nr is out of range, if when is out of range. May fail with + * -EFAULT if any of the memory specified to is invalid. May return + * 0 or < min_nr if no events are available and the timeout specified + * by when has elapsed, where when == NULL specifies an infinite + * timeout. Note that the timeout pointed to by when is relative and + * will be updated if not NULL and the operation blocks. Will fail + * with -ENOSYS if not implemented. + */ +asmlinkage long sys_io_getevents(aio_context_t ctx_id, + long min_nr, + long nr, + struct io_event *events, + struct timespec *timeout) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (unlikely(min_nr > nr || min_nr < 0 || nr < 0)) + return ret; + + if (likely(NULL != ioctx)) { + ret = read_events(ioctx, min_nr, nr, events, timeout); + put_ioctx(ioctx); + } + + return ret; +} + +__initcall(aio_setup); + +EXPORT_SYMBOL(generic_file_kvec_read); +EXPORT_SYMBOL(generic_file_aio_read); +EXPORT_SYMBOL(generic_file_kvec_write); +EXPORT_SYMBOL(generic_file_aio_write); diff -urNp x-ref/fs/buffer.c x/fs/buffer.c --- x-ref/fs/buffer.c 2003-02-14 05:22:27.000000000 +0100 +++ x/fs/buffer.c 2003-02-14 05:22:30.000000000 +0100 @@ -3067,3 +3067,220 @@ static int __init bdflush_init(void) module_init(bdflush_init) +/* async kio interface */ +struct brw_cb { + kvec_cb_t cb; + atomic_t io_count; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_cb_put(struct brw_cb *brw_cb) +{ + if (atomic_dec_and_test(&brw_cb->io_count)) { + ssize_t res = 0, err = 0; + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (!err && buffer_uptodate(bh)) + res += bh->b_size; + else + err = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (!res) + res = err; + + brw_cb->cb.fn(brw_cb->cb.data, brw_cb->cb.vec, res); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. 
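+ * brw_kvec_async() initialises io_count to nr+1 and drops the extra reference
+ * itself after submitting all the buffer heads, so the callback cannot fire
+ * while submission is still in progress.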
+ */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + brw_cb_put(brw_cb); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift) +{ + struct kvec *vec = cb.vec; + struct kveclet *veclet; + int err; + int length; + unsigned sector_size = 1 << sector_shift; + int i; + + struct brw_cb *brw_cb; + + if (!vec->nr) + BUG(); + + /* + * First, do some alignment and validity checks + */ + length = 0; + for (veclet=vec->veclet, i=0; i < vec->nr; i++,veclet++) { + length += veclet->length; + if ((veclet->offset & (sector_size-1)) || + (veclet->length & (sector_size-1))) { + printk("brw_kiovec_async: tuple[%d]->offset=0x%x length=0x%x sector_size: 0x%x\n", i, veclet->offset, veclet->length, sector_size); + return -EINVAL; + } + } + + if (length < (blocks << sector_shift)) + BUG(); + + /* + * OK to walk down the iovec doing page IO on each page we find. + */ + err = 0; + + if (!blocks) { + printk("brw_kiovec_async: !i\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb->cb = cb; + brw_cb->nr = 0; + + /* This is ugly. FIXME. */ + for (i=0, veclet=vec->veclet; inr; i++,veclet++) { + struct page *page = veclet->page; + unsigned offset = veclet->offset; + unsigned length = veclet->length; + + if (!page) + BUG(); + + while (length > 0) { + struct buffer_head *tmp; + tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO); + err = -ENOMEM; + if (!tmp) + goto error; + + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, page, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blknr++; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) + | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + + if (brw_cb->nr >= blocks) + goto submit; + } /* End of block loop */ + } /* End of page loop */ + +submit: + atomic_set(&brw_cb->io_count, brw_cb->nr+1); + /* okay, we've setup all our io requests, now fire them off! */ + for (i=0; inr; i++) + submit_bh(rw, brw_cb->bh[i]); + brw_cb_put(brw_cb); + + return 0; + +error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. 
*/ + for (i=0; inr; i++) + kmem_cache_free(bh_cachep, brw_cb->bh[i]); + kfree(brw_cb); + } + + return err; +} +#if 0 +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; ierrno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} +#endif diff -urNp x-ref/fs/exec.c x/fs/exec.c --- x-ref/fs/exec.c 2003-02-14 05:22:15.000000000 +0100 +++ x/fs/exec.c 2003-02-14 05:22:30.000000000 +0100 @@ -425,6 +425,7 @@ static int exec_mmap(void) old_mm = current->mm; if (old_mm && atomic_read(&old_mm->mm_users) == 1) { mm_release(); + exit_aio(old_mm); exit_mmap(old_mm); return 0; } diff -urNp x-ref/fs/ext2/file.c x/fs/ext2/file.c --- x-ref/fs/ext2/file.c 2002-01-22 18:54:59.000000000 +0100 +++ x/fs/ext2/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -40,6 +40,8 @@ static int ext2_release_file (struct ino */ struct file_operations ext2_file_operations = { llseek: generic_file_llseek, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, read: generic_file_read, write: generic_file_write, ioctl: ext2_ioctl, @@ -47,6 +49,8 @@ struct file_operations ext2_file_operati open: generic_file_open, release: ext2_release_file, fsync: ext2_sync_file, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext2_file_inode_operations = { diff -urNp x-ref/fs/ext3/file.c x/fs/ext3/file.c --- x-ref/fs/ext3/file.c 2002-11-29 02:23:15.000000000 +0100 +++ x/fs/ext3/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -111,6 +111,8 @@ force_commit: struct file_operations ext3_file_operations = { llseek: generic_file_llseek, /* BKL held */ + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, /* FIXME: attributes */ read: generic_file_read, /* BKL not held. Don't need */ write: ext3_file_write, /* BKL not held. Don't need */ ioctl: ext3_ioctl, /* BKL held */ @@ -118,6 +120,8 @@ struct file_operations ext3_file_operati open: ext3_open_file, /* BKL not held. Don't need */ release: ext3_release_file, /* BKL not held. 
Don't need */ fsync: ext3_sync_file, /* BKL held */ + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext3_file_inode_operations = { diff -urNp x-ref/fs/file_table.c x/fs/file_table.c --- x-ref/fs/file_table.c 2002-11-29 02:23:15.000000000 +0100 +++ x/fs/file_table.c 2003-02-14 05:22:30.000000000 +0100 @@ -97,33 +97,37 @@ int init_private_file(struct file *filp, return 0; } -void fput(struct file * file) +inline void __fput(struct file * file) { struct dentry * dentry = file->f_dentry; struct vfsmount * mnt = file->f_vfsmnt; struct inode * inode = dentry->d_inode; - if (atomic_dec_and_test(&file->f_count)) { - locks_remove_flock(file); + locks_remove_flock(file); - if (file->f_iobuf) - free_kiovec(1, &file->f_iobuf); + if (file->f_iobuf) + free_kiovec(1, &file->f_iobuf); - if (file->f_op && file->f_op->release) - file->f_op->release(inode, file); - fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); - file_list_lock(); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; - list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; - file_list_unlock(); - dput(dentry); - mntput(mnt); - } + if (file->f_op && file->f_op->release) + file->f_op->release(inode, file); + fops_put(file->f_op); + if (file->f_mode & FMODE_WRITE) + put_write_access(inode); + file_list_lock(); + file->f_dentry = NULL; + file->f_vfsmnt = NULL; + list_del(&file->f_list); + list_add(&file->f_list, &free_list); + files_stat.nr_free_files++; + file_list_unlock(); + dput(dentry); + mntput(mnt); +} + +void fput(struct file * file) +{ + if (atomic_dec_and_test(&file->f_count)) + __fput(file); } struct file * fget(unsigned int fd) diff -urNp x-ref/fs/jfs/file.c x/fs/jfs/file.c --- x-ref/fs/jfs/file.c 2003-01-29 06:14:11.000000000 +0100 +++ x/fs/jfs/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -102,4 +102,8 @@ struct file_operations jfs_file_operatio .mmap = generic_file_mmap, .fsync = jfs_fsync, .release = jfs_release, + .kvec_read = generic_file_kvec_read, + .kvec_write = generic_file_kvec_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, }; diff -urNp x-ref/fs/nfs/file.c x/fs/nfs/file.c --- x-ref/fs/nfs/file.c 2003-02-14 05:22:16.000000000 +0100 +++ x/fs/nfs/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -40,9 +40,13 @@ static ssize_t nfs_file_read(struct file static ssize_t nfs_file_write(struct file *, const char *, size_t, loff_t *); static int nfs_file_flush(struct file *); static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); struct file_operations nfs_file_operations = { llseek: generic_file_llseek, + kvec_read: nfs_kvec_read, + kvec_write: nfs_kvec_write, read: nfs_file_read, write: nfs_file_write, mmap: nfs_file_mmap, @@ -51,6 +55,8 @@ struct file_operations nfs_file_operatio release: nfs_release, fsync: nfs_fsync, lock: nfs_lock, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations nfs_file_inode_operations = { @@ -89,6 +95,28 @@ nfs_file_flush(struct file *file) return status; } +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return 
generic_file_kvec_write(file, cb, count, pos); + return ret; +} + +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return generic_file_kvec_read(file, cb, count, pos); + return ret; +} + static ssize_t nfs_file_read(struct file * file, char * buf, size_t count, loff_t *ppos) { diff -urNp x-ref/fs/reiserfs/file.c x/fs/reiserfs/file.c --- x-ref/fs/reiserfs/file.c 2002-11-29 02:23:16.000000000 +0100 +++ x/fs/reiserfs/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -136,6 +136,10 @@ struct file_operations reiserfs_file_ope mmap: generic_file_mmap, release: reiserfs_file_release, fsync: reiserfs_sync_file, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; diff -urNp x-ref/include/asm-i386/kmap_types.h x/include/asm-i386/kmap_types.h --- x-ref/include/asm-i386/kmap_types.h 2002-11-29 02:23:16.000000000 +0100 +++ x/include/asm-i386/kmap_types.h 2003-02-14 05:22:30.000000000 +0100 @@ -8,6 +8,8 @@ enum km_type { KM_USER0, KM_USER1, KM_BH_IRQ, + KM_IRQ0, + KM_IRQ1, KM_TYPE_NR }; diff -urNp x-ref/include/asm-i386/semaphore.h x/include/asm-i386/semaphore.h --- x-ref/include/asm-i386/semaphore.h 2002-11-29 02:23:16.000000000 +0100 +++ x/include/asm-i386/semaphore.h 2003-02-14 05:22:30.000000000 +0100 @@ -131,6 +131,31 @@ static inline void down(struct semaphore :"memory"); } +/* Returns 0 if we acquired the semaphore, 1 if it was queued. */ +struct worktodo; +static inline int wtd_down(struct worktodo *wtd, struct semaphore *sem) +{ + int ret = 0; +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + + __asm__ __volatile__( + "# atomic down operation\n\t" + LOCK "decl %0\n\t" /* --sem->count */ + "js 2f\n" + "1:\n" + LOCK_SECTION_START("") + "2:\tcall __wtd_down_failed\n\t" + "movl $1,%1\n\t" + "jmp 1b\n" + LOCK_SECTION_END + :"=m" (sem->count), "=r" (ret) + :"c" (sem), "1" (ret), "d" (wtd) + :"memory"); + return ret; +} + /* * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR diff -urNp x-ref/include/asm-ppc/kmap_types.h x/include/asm-ppc/kmap_types.h --- x-ref/include/asm-ppc/kmap_types.h 2002-11-29 02:23:17.000000000 +0100 +++ x/include/asm-ppc/kmap_types.h 2003-02-14 05:22:30.000000000 +0100 @@ -12,6 +12,8 @@ enum km_type { KM_USER0, KM_USER1, KM_BH_IRQ, + KM_IRQ0, + KM_IRQ1, KM_TYPE_NR }; diff -urNp x-ref/include/asm-x86_64/kmap_types.h x/include/asm-x86_64/kmap_types.h --- x-ref/include/asm-x86_64/kmap_types.h 2003-01-29 06:14:23.000000000 +0100 +++ x/include/asm-x86_64/kmap_types.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,13 +0,0 @@ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -enum km_type { - KM_BOUNCE_READ, - KM_SKB_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_TYPE_NR -}; - -#endif diff -urNp x-ref/include/asm-x86_64/semaphore.h x/include/asm-x86_64/semaphore.h --- x-ref/include/asm-x86_64/semaphore.h 2003-01-29 06:14:23.000000000 +0100 +++ x/include/asm-x86_64/semaphore.h 2003-02-14 05:22:30.000000000 +0100 @@ -133,6 +133,31 @@ static inline void down(struct semaphore :"memory"); } +/* Returns 0 if we acquired the semaphore, 1 if it was queued. 
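+ * If it was queued, __wtd_down() parks the worktodo on the semaphore's wait queue
+ * and queues it for execution once the semaphore is eventually acquired, so the
+ * caller must not assume it holds the semaphore on return.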
*/ +struct worktodo; +static inline int wtd_down(struct worktodo *wtd, struct semaphore *sem) +{ + int ret = 0; +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + + __asm__ __volatile__( + "# atomic down operation\n\t" + LOCK "decl %0\n\t" /* --sem->count */ + "js 2f\n" + "1:\n" + LOCK_SECTION_START("") + "2:\tcall __wtd_down_failed\n\t" + "movl $1,%1\n\t" + "jmp 1b\n" + LOCK_SECTION_END + :"=m" (sem->count), "=r" (ret) + :"D" (sem), "1" (ret), "S" (wtd) + :"memory"); + return ret; +} + /* * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR @@ -215,3 +240,4 @@ static inline void up(struct semaphore * } #endif /* __KERNEL__ */ #endif + diff -urNp x-ref/include/linux/aio.h x/include/linux/aio.h --- x-ref/include/linux/aio.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/aio.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,127 @@ +#ifndef __LINUX__AIO_H +#define __LINUX__AIO_H + +#include +#include +#include +#include + +#include + +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kioctx; + +/* Notes on cancelling a kiocb: + * If a kiocb is cancelled, aio_complete may return 0 to indicate + * that cancel has not yet disposed of the kiocb. All cancel + * operations *must* call aio_put_req to dispose of the kiocb + * to guard against races with the completion code. + */ +#define KIOCB_C_CANCELLED 0x01 +#define KIOCB_C_COMPLETE 0x02 + +struct kiocb { + struct list_head list; + struct file *filp; + struct kioctx *ctx; + void *user_obj; + __u64 user_data; + loff_t pos; + unsigned long buf; + size_t nr_transferred; /* used for chunking */ + size_t size; + size_t this_size; + unsigned key; /* id of this request */ + int (*cancel)(struct kiocb *, struct io_event *); + void *data; /* for use by the the async op */ + int users; + union { + struct tq_struct tq; /* argh. 
*/ + struct list_head list; + } u; +}; + +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned woke; /* set when a wakeup was sent */ + unsigned pad[3]; + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr) + +#define AIO_RING_PAGES 8 +struct aio_ring_info { + //struct file *mmap_file; + struct kvec *kvec; + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + spinlock_t ring_lock; + unsigned nr_pages; + + unsigned nr, tail; + + struct page *internal_pages[AIO_RING_PAGES]; +}; + +struct kioctx { + atomic_t users; + int dead; + struct mm_struct *mm; + + /* This needs improving */ + unsigned long user_id; + struct kioctx *next; + + wait_queue_head_t wait; + + spinlock_t lock; + + int reqs_active; + struct list_head free_reqs; + struct list_head active_reqs; /* used for cancellation */ + + unsigned max_reqs; + + struct aio_ring_info ring_info; +}; + +/* prototypes */ +extern unsigned aio_max_size; + +extern int FASTCALL(aio_put_req(struct kiocb *iocb)); +extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2)); +extern void FASTCALL(__put_ioctx(struct kioctx *ctx)); +struct mm_struct; +extern void FASTCALL(exit_aio(struct mm_struct *mm)); + +#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0) +#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0) + +#include + +static inline struct kiocb *list_kiocb(struct list_head *h) +{ + return list_entry(h, struct kiocb, list); +} + +struct file; +extern ssize_t generic_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size); +extern ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size); +extern ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +extern ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); + +/* for sysctl: */ +extern unsigned aio_nr, aio_max_nr, aio_max_size, aio_max_pinned; + +#endif /* __LINUX__AIO_H */ diff -urNp x-ref/include/linux/aio_abi.h x/include/linux/aio_abi.h --- x-ref/include/linux/aio_abi.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/aio_abi.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,91 @@ +/* linux/aio_abi.h + * + * Copyright 2000,2001,2002 Red Hat. + * + * Written by Benjamin LaHaise + * + * Distribute under the terms of the GPLv2 (see ../../COPYING) or under + * the following terms. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation is hereby granted, provided that the above copyright + * notice appears in all copies. This software is provided without any + * warranty, express or implied. Red Hat makes no representations about + * the suitability of this software for any purpose. + * + * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, + * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF + * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
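
The aio_ring_avail() macro above uses the usual one-slot-reserved ring convention: free slots are counted between tail and head modulo nr, and one event slot is always left unused so that head == tail can only mean "empty", never "full" (assuming, as in the rest of the AIO code, that completions are appended at tail and the consumer advances head). A minimal userspace sketch of the same arithmetic, with hypothetical values:

/* Sketch only: mirrors the aio_ring_avail() expression with made-up numbers. */
#include <stdio.h>

static unsigned ring_avail(unsigned head, unsigned tail, unsigned nr)
{
	/* same expression as the aio_ring_avail() macro */
	return (head + nr - 1 - tail) % nr;
}

int main(void)
{
	unsigned nr = 8;	/* hypothetical ring size */

	printf("empty : %u free\n", ring_avail(0, 0, nr));	/* 7 */
	printf("1 used: %u free\n", ring_avail(0, 1, nr));	/* 6 */
	printf("full  : %u free\n", ring_avail(3, 2, nr));	/* 0 */
	return 0;
}
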
+ * + * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND + * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, + * ENHANCEMENTS, OR MODIFICATIONS. + */ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +#include + +typedef unsigned long aio_context_t; + +enum { + IOCB_CMD_PREAD = 0, + IOCB_CMD_PWRITE = 1, + IOCB_CMD_FSYNC = 2, + IOCB_CMD_FDSYNC = 3, + /* These two are experimental. + * IOCB_CMD_PREADX = 4, + * IOCB_CMD_POLL = 5, + */ + IOCB_CMD_NOOP = 6, +}; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#if defined(__LITTLE_ENDIAN) +#define PADDED(x,y) x, y +#elif defined(__BIG_ENDIAN) +#define PADDED(x,y) y, x +#else +#error edit for your odd byteorder. +#endif + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + */ + +struct iocb { + /* these are internal to the kernel/libc. */ + __u64 aio_data; /* data to be returned in event's data */ + __u32 PADDED(aio_key, aio_reserved1); + /* the kernel sets aio_key to the req # */ + + /* common fields */ + __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ + __s16 aio_reqprio; + __u32 aio_fildes; + + __u64 aio_buf; + __u64 aio_nbytes; + __s64 aio_offset; + + /* extra parameters */ + __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */ + __u64 aio_reserved3; +}; /* 64 bytes */ + +#undef IFBIG +#undef IFLITTLE + +#endif /* __LINUX__AIO_ABI_H */ diff -urNp x-ref/include/linux/brlock.h x/include/linux/brlock.h --- x-ref/include/linux/brlock.h 2003-02-14 05:22:25.000000000 +0100 +++ x/include/linux/brlock.h 2003-02-14 05:22:30.000000000 +0100 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_AIO_REQ_LOCK, __BR_END }; diff -urNp x-ref/include/linux/file.h x/include/linux/file.h --- x-ref/include/linux/file.h 2002-08-09 14:52:29.000000000 +0200 +++ x/include/linux/file.h 2003-02-14 05:22:30.000000000 +0100 @@ -5,6 +5,7 @@ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H +extern void FASTCALL(__fput(struct file *)); extern void FASTCALL(fput(struct file *)); extern struct file * FASTCALL(fget(unsigned int fd)); diff -urNp x-ref/include/linux/fs.h x/include/linux/fs.h --- x-ref/include/linux/fs.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/fs.h 2003-02-14 05:22:30.000000000 +0100 @@ -198,6 +198,8 @@ extern int leases_enable, dir_notify_ena #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ #ifdef __KERNEL__ +#include +#include #include #include @@ -950,6 +952,15 @@ struct file_operations { ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + + /* in-kernel fully async api */ + int (*kvec_read)(struct file *, kvec_cb_t, size_t, loff_t); + int (*kvec_write)(struct file *, kvec_cb_t, size_t, loff_t); + + /* userland aio ops */ + ssize_t (*aio_read)(struct file *, struct kiocb *, struct iocb *); + ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb *); + ssize_t 
(*aio_fsync)(struct file *, struct kiocb *, struct iocb *); }; struct inode_operations { @@ -1604,6 +1615,8 @@ extern ssize_t generic_file_write_nolock extern void __do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t, int); #define do_generic_file_read(filp, ppos, desc, actor) __do_generic_file_read(filp, ppos, desc, actor, 0) #define do_generic_file_read_atomic(filp, ppos, desc, actor) __do_generic_file_read(filp, ppos, desc, actor, 1) +extern int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); +extern int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); diff -urNp x-ref/include/linux/highmem.h x/include/linux/highmem.h --- x-ref/include/linux/highmem.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/highmem.h 2003-02-14 05:22:30.000000000 +0100 @@ -82,6 +82,18 @@ static inline void *kmap(struct page *pa #define bh_kmap_irq(bh, flags) ((bh)->b_data) #define bh_kunmap_irq(bh, flags) do { *(flags) = 0; } while (0) +enum km_type { + KM_BOUNCE_READ, + KM_SKB_SUNRPC_DATA, + KM_SKB_DATA_SOFTIRQ, + KM_USER0, + KM_USER1, + KM_BH_IRQ, + KM_IRQ0, + KM_IRQ1, + KM_TYPE_NR +}; + #endif /* CONFIG_HIGHMEM */ /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ diff -urNp x-ref/include/linux/kiovec.h x/include/linux/kiovec.h --- x-ref/include/linux/kiovec.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/kiovec.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,155 @@ +#ifndef __LINUX__KIOVEC_H +#define __LINUX__KIOVEC_H + +struct page; +struct mm_struct; +#include + +struct kveclet { + struct page *page; + unsigned offset; + unsigned length; +}; + +struct kvec { + unsigned max_nr; + unsigned nr; + struct kveclet veclet[0]; +}; + +struct kvec_cb { + struct kvec *vec; + void (*fn)(void *data, struct kvec *vec, ssize_t res); + void *data; +}; + +struct kvec_cb_list { + struct list_head list; + struct kvec_cb cb; +}; + +#ifndef _LINUX_TYPES_H +#include +#endif +#ifndef _LINUX_KDEV_T_H +#include +#endif +#ifdef CONFIG_HIGHMEM +#include +#endif + +extern struct kvec *FASTCALL(map_user_kvec(int rw, unsigned long va, size_t len)); +extern struct kvec *FASTCALL(mm_map_user_kvec(struct mm_struct *, int rw, + unsigned long va, size_t len)); +extern void FASTCALL(unmap_kvec(struct kvec *, int dirtied)); +extern void FASTCALL(free_kvec(struct kvec *)); + +/* brw_kvec_async: + * Performs direct io to/from disk into cb.vec. Count is the number + * of sectors to read, sector_shift is the blocksize (which must be + * compatible with the kernel's current idea of the device's sector + * size) in log2. blknr is the starting sector offset on dev. + * + */ +extern int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned count, + unsigned long blknr, int sector_shift); + +/* Memory copy helpers usage: + * void foo(... struct kveclet *veclet...) + * + * struct kvec_dst dst; + * + * kvec_dst_init(&dst); -- resets type + * kvec_dst_set(&dst, veclet); -- set target & clear offset + * kvec_dst_map(&dst); -- activates kmap + * for (...) + * memcpy_to_kvec_dst(&dst, data, size); -- each copy appends + * kvec_dst_unmap(&dst); -- releases kmap + * + * Note that scheduling is not permitted between kvec_dst_map() and + * kvec_dst_unmap(). 
This is because internally the routines make use + * of an atomic kmap. + */ +struct kvec_dst { + char *addr; + char *dst; + struct kveclet *let; + int space; + int offset; +#ifdef CONFIG_HIGHMEM + enum km_type type; +#endif +}; + + +#define kvec_dst_set(Xdst, Xlet) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = (Xlet); \ + _dst->let = _let; \ + _dst->space = _let->length; \ + _dst->offset = 0; \ + } while(0) + +#ifdef CONFIG_HIGHMEM +#define kvec_dst_map(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = _dst->let; \ + _dst->dst = _dst->addr = kmap_atomic(_let->page, _dst->type);\ + _dst->dst += _let->offset + _dst->offset; \ + _dst->space = _let->length - _dst->offset; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_init(Xdst) \ + do { \ + (Xdst)->space = 0; \ + (Xdst)->addr = 0; \ + (Xdst)->offset = 0; \ + (Xdst)->type = KM_USER0; \ + } while(0) + +#define kvec_dst_unmap(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + kunmap_atomic(_dst->addr, _dst->type); \ + _dst->offset = _dst->dst - _dst->addr; \ + _dst->offset -= _dst->let->offset; \ + _dst->addr = NULL; \ + } while(0) +#else +#define kvec_dst_map(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = _dst->let; \ + _dst->dst = _dst->addr = page_address(_let->page); \ + _dst->dst += _let->offset + _dst->offset; \ + _dst->space = _let->length - _dst->offset; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_init(Xdst) \ + do { \ + (Xdst)->space = 0; \ + (Xdst)->addr = 0; \ + (Xdst)->offset = 0; \ + } while(0) + +#define kvec_dst_unmap(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + _dst->offset = _dst->dst - _dst->addr; \ + _dst->offset -= _dst->let->offset; \ + _dst->addr = NULL; \ + } while(0) +#endif + +extern void FASTCALL(memcpy_to_kvec_dst(struct kvec_dst *dst, + const char *from, long len)); +extern void FASTCALL(memcpy_from_kvec_dst(char *to, + struct kvec_dst *from, long len)); +extern int FASTCALL(copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len)); + + +#endif /* __LINUX__KIOVEC_H */ diff -urNp x-ref/include/linux/list.h x/include/linux/list.h --- x-ref/include/linux/list.h 2003-02-14 05:22:15.000000000 +0100 +++ x/include/linux/list.h 2003-02-14 05:22:30.000000000 +0100 @@ -224,6 +224,8 @@ static inline void list_splice_init(list pos = list_entry(pos->member.next, typeof(*pos), member), \ prefetch(pos->member.next)) +#define list_first(head) (((head)->next != (head)) ? (head)->next: (struct list_head *) 0) + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ #endif diff -urNp x-ref/include/linux/mm.h x/include/linux/mm.h --- x-ref/include/linux/mm.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/mm.h 2003-02-14 05:22:30.000000000 +0100 @@ -712,9 +712,9 @@ static inline int expand_stack(struct vm } /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); -extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, - struct vm_area_struct **pprev); +extern struct vm_area_struct * FASTCALL(find_vma(struct mm_struct * mm, unsigned long addr)); +extern struct vm_area_struct * FASTCALL(find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev)); /* Look up the first VMA which intersects the interval start_addr..end_addr-1, NULL if none. Assume start_addr < end_addr. 
*/ diff -urNp x-ref/include/linux/pagemap.h x/include/linux/pagemap.h --- x-ref/include/linux/pagemap.h 2003-02-14 05:22:13.000000000 +0100 +++ x/include/linux/pagemap.h 2003-02-14 05:22:30.000000000 +0100 @@ -88,6 +88,7 @@ extern struct page *find_trylock_page(st extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index); extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index); extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); +extern wait_queue_head_t *FASTCALL(page_waitqueue(struct page *page)); extern void ___wait_on_page(struct page *); diff -urNp x-ref/include/linux/sched.h x/include/linux/sched.h --- x-ref/include/linux/sched.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/sched.h 2003-02-14 05:22:30.000000000 +0100 @@ -223,6 +223,7 @@ struct files_struct { extern int max_map_count; +struct kioctx; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ rb_root_t mm_rb; @@ -251,6 +252,8 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + + struct kioctx *ioctx_list; }; extern int mmlist_nr; @@ -832,6 +835,7 @@ extern int do_fork(unsigned long, unsign extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void wait_task_inactive(task_t * p); diff -urNp x-ref/include/linux/sysctl.h x/include/linux/sysctl.h --- x-ref/include/linux/sysctl.h 2003-02-14 05:22:25.000000000 +0100 +++ x/include/linux/sysctl.h 2003-02-14 05:22:30.000000000 +0100 @@ -599,8 +599,7 @@ enum { /* CTL_PROC names: */ /* CTL_FS names: */ -enum -{ +enum { FS_NRINODE=1, /* int:current number of allocated inodes */ FS_STATINODE=2, FS_MAXINODE=3, /* int:maximum number of inodes that can be allocated */ @@ -618,6 +617,10 @@ enum FS_LEASE_TIME=15, /* int: maximum time to wait for a lease break */ FS_DQSTATS=16, /* dir: disc quota usage statistics */ FS_XFS=17, /* struct: control xfs parameters */ + FS_AIO_NR=18, /* int: current number of aio requests */ + FS_AIO_MAX_NR=19, /* int: max system wide aio requests */ + FS_AIO_MAX_SIZE=20, /* int: max size of read/write chunks */ + FS_AIO_MAX_PINNED=21, /* long: max memory pinned (in pages) */ }; /* /proc/sys/fs/quota/ */ diff -urNp x-ref/include/linux/tqueue.h x/include/linux/tqueue.h --- x-ref/include/linux/tqueue.h 2002-12-18 23:58:19.000000000 +0100 +++ x/include/linux/tqueue.h 2003-02-14 05:22:30.000000000 +0100 @@ -68,6 +68,9 @@ typedef struct list_head task_queue; extern task_queue tq_timer, tq_immediate, tq_disk; +/* same as run_task_queue(&tq_disk) but async, from wtd.c */ +extern void async_run_tq_disk(void); + /* * To implement your own list of active bottom halfs, use the following * two definitions: diff -urNp x-ref/include/linux/types.h x/include/linux/types.h --- x-ref/include/linux/types.h 2002-12-18 23:51:41.000000000 +0100 +++ x/include/linux/types.h 2003-02-14 05:22:30.000000000 +0100 @@ -127,4 +127,9 @@ struct ustat { char f_fpack[6]; }; +/* kernel typedefs -- they belong here. 
*/ +#ifdef __KERNEL__ +typedef struct kvec_cb kvec_cb_t; +#endif /* __KERNEL__ */ + #endif /* _LINUX_TYPES_H */ diff -urNp x-ref/include/linux/wait.h x/include/linux/wait.h --- x-ref/include/linux/wait.h 2003-02-14 05:22:11.000000000 +0100 +++ x/include/linux/wait.h 2003-02-14 05:22:30.000000000 +0100 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. Can be switched between spinlock_t and @@ -139,6 +142,7 @@ typedef struct __wait_queue_head wait_qu #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -176,6 +180,22 @@ static inline void init_waitqueue_entry( #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q || !p) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -233,6 +253,38 @@ static inline void __remove_wait_queue(w list_del(&old->task_list); } +#define add_wait_queue_cond(q, wait, cond) \ + ({ \ + unsigned long flags; \ + int _raced = 0; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + __add_wait_queue((q), (wait)); \ + mb(); \ + if (!(cond)) { \ + _raced = 1; \ + __remove_wait_queue((q), (wait)); \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + _raced; \ + }) + +#define add_wait_queue_exclusive_cond(q, wait, cond) \ + ({ \ + unsigned long flags; \ + int _raced = 0; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = WQ_FLAG_EXCLUSIVE; \ + __add_wait_queue_tail((q), (wait)); \ + mb(); \ + if (!(cond)) { \ + _raced = 1; \ + __remove_wait_queue((q), (wait)); \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + _raced; \ + }) + #endif /* __KERNEL__ */ #endif diff -urNp x-ref/include/linux/worktodo.h x/include/linux/worktodo.h --- x-ref/include/linux/worktodo.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/worktodo.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,76 @@ +/* + * Written by Benjamin LaHaise. + * + * Copyright 2000-2001 Red Hat, Inc. + * + * #include "gpl.h" + * + * Basic design idea from Jeff Merkey. + * Stack based on ideas from Ingo Molnar. 
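
add_wait_queue_cond() and add_wait_queue_exclusive_cond() above close the check-then-sleep race by queueing the waiter first, re-testing the condition while the waitqueue lock is still held, and backing the entry out again (reporting a race) if the reason to sleep has already gone away. A minimal userspace sketch of that pattern, using a pthread mutex in place of the waitqueue lock; the names are illustrative only, not part of the patch:

#include <pthread.h>
#include <stdio.h>

struct waiter { struct waiter *next; };

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct waiter *queue_head;

/* Returns 1 if we raced (nothing left to wait for, entry backed out),
 * 0 if the waiter stays queued and will be woken later. */
static int add_waiter_cond(struct waiter *w, int still_must_wait)
{
	int raced = 0;

	pthread_mutex_lock(&queue_lock);
	w->next = queue_head;
	queue_head = w;
	/* re-test the wait condition while the lock is held, as the
	 * macros do with mb() + (cond) */
	if (!still_must_wait) {
		queue_head = w->next;	/* back the waiter out again */
		raced = 1;
	}
	pthread_mutex_unlock(&queue_lock);
	return raced;
}

int main(void)
{
	struct waiter w;

	printf("still busy   -> raced=%d (waiter left queued)\n",
	       add_waiter_cond(&w, 1));
	queue_head = NULL;	/* pretend a wakeup removed it */
	printf("already free -> raced=%d (caller must not sleep)\n",
	       add_waiter_cond(&w, 0));
	return 0;
}
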
+ */ +#ifndef __LINUX__WORKTODO_H +#define __LINUX__WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include +#endif +#ifndef _LINUX_TQUEUE_H +#include +#endif + +struct wtd_stack { + void (*fn)(void *data); + void *data; +}; + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primatives */ + + int sp; + struct wtd_stack stack[3]; +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_init(wtd, routine) do { \ + INIT_TQUEUE(&(wtd)->tq, (routine), (wtd)); \ + (wtd)->data = 0; \ + (wtd)->sp = 0; \ +} while (0) + +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_push(wtd, action, wtddata) \ +do { \ + (wtd)->stack[(wtd)->sp].fn = (wtd)->tq.routine; \ + (wtd)->stack[(wtd)->sp++].data = (wtd)->tq.data;\ + (wtd)->tq.routine = action; \ + (wtd)->tq.data = wtddata; \ +} while (0) + +static inline void wtd_pop(struct worktodo *wtd) +{ + if (wtd->sp) { + wtd->sp--; + wtd->tq.routine = wtd->stack[wtd->sp].fn; + wtd->tq.data = wtd->stack[wtd->sp].data; + } +} + +#define wtd_set_action(wtd, action, wtddata) INIT_TQUEUE(&(wtd)->tq, action, wtddata) + +struct page; +struct buffer_head; +struct semaphore; +extern int wtd_lock_page(struct worktodo *wtd, struct page *page); +extern int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); + +#if 0 /* not implemented yet */ +extern int wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* __LINUX__WORKTODO_H */ diff -urNp x-ref/kernel/fork.c x/kernel/fork.c --- x-ref/kernel/fork.c 2003-02-14 05:22:21.000000000 +0100 +++ x/kernel/fork.c 2003-02-14 05:22:30.000000000 +0100 @@ -54,6 +54,16 @@ void add_wait_queue(wait_queue_head_t *q wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -278,6 +288,7 @@ int mmlist_nr; static struct mm_struct * mm_init(struct mm_struct * mm) { + mm->ioctx_list = NULL; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); @@ -313,6 +324,7 @@ struct mm_struct * mm_alloc(void) */ inline void __mmdrop(struct mm_struct *mm) { + BUG_ON(mm->ioctx_list); BUG_ON(mm == &init_mm); pgd_free(mm->pgd); check_pgt_cache(); @@ -332,6 +344,7 @@ void mmput(struct mm_struct *mm) list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); + exit_aio(mm); exit_mmap(mm); mmdrop(mm); } diff -urNp x-ref/kernel/ksyms.c x/kernel/ksyms.c --- x-ref/kernel/ksyms.c 2003-02-14 05:22:26.000000000 +0100 +++ x/kernel/ksyms.c 2003-02-14 05:22:30.000000000 +0100 @@ -446,6 +446,13 @@ EXPORT_SYMBOL(unlock_kiovec); EXPORT_SYMBOL(brw_kiovec); EXPORT_SYMBOL(kiobuf_wait_for_io); +/* kvecs */ +EXPORT_SYMBOL(map_user_kvec); +EXPORT_SYMBOL(unmap_kvec); +EXPORT_SYMBOL(free_kvec); +EXPORT_SYMBOL(memcpy_to_kvec_dst); +EXPORT_SYMBOL(memcpy_from_kvec_dst); + /* dma handling */ EXPORT_SYMBOL(request_dma); EXPORT_SYMBOL(free_dma); diff -urNp x-ref/kernel/sched.c x/kernel/sched.c --- x-ref/kernel/sched.c 2003-02-14 05:22:26.000000000 +0100 +++ x/kernel/sched.c 2003-02-14 05:22:30.000000000 +0100 @@ -976,13 +976,22 @@ void force_cpu_reschedule(int 
cpu) */ static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync) { - struct list_head *tmp; + struct list_head *tmp, *next; unsigned int state; wait_queue_t *curr; task_t *p; + wait_queue_func_t func; - list_for_each(tmp, &q->task_list) { + list_for_each_safe(tmp, next, &q->task_list) { curr = list_entry(tmp, wait_queue_t, task_list); + func = curr->func; + if (func) { + unsigned int flags = curr->flags; + func(curr); + if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if ((state & mode) && try_to_wake_up(p, sync) && diff -urNp x-ref/kernel/sysctl.c x/kernel/sysctl.c --- x-ref/kernel/sysctl.c 2003-02-14 05:22:26.000000000 +0100 +++ x/kernel/sysctl.c 2003-02-14 05:22:30.000000000 +0100 @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -305,6 +306,8 @@ static ctl_table proc_table[] = { {0} }; +extern int user_pinned_pages; + static ctl_table fs_table[] = { {FS_NRINODE, "inode-nr", &inodes_stat, 2*sizeof(int), 0444, NULL, &proc_dointvec}, @@ -328,6 +331,16 @@ static ctl_table fs_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {FS_AIO_NR, "aio-nr", &aio_nr, sizeof(aio_nr), + 0444, NULL, &proc_dointvec}, + {FS_AIO_MAX_NR, "aio-max-nr", &aio_max_nr, sizeof(aio_max_nr), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_SIZE, "aio-max-size", &aio_max_size, sizeof(aio_max_size), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_PINNED, "aio-max-pinned", &aio_max_pinned, sizeof(aio_max_pinned), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_PINNED+1, "aio-pinned", &user_pinned_pages, 4, + 0644, NULL, &proc_dointvec}, {0} }; diff -urNp x-ref/mm/Makefile x/mm/Makefile --- x-ref/mm/Makefile 2003-02-14 05:22:17.000000000 +0100 +++ x/mm/Makefile 2003-02-14 05:22:43.000000000 +0100 @@ -18,5 +18,6 @@ obj-y := memory.o mmap.o filemap.o mpro obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PROC_MM) += proc_mm.o +obj-y += wtd.o include $(TOPDIR)/Rules.make diff -urNp x-ref/mm/filemap.c x/mm/filemap.c --- x-ref/mm/filemap.c 2003-02-14 05:22:27.000000000 +0100 +++ x/mm/filemap.c 2003-02-14 05:22:30.000000000 +0100 @@ -29,6 +29,8 @@ #include #include +#include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -770,7 +772,7 @@ static inline wait_queue_head_t * wait_t * at a cost of "thundering herd" phenomena during rare hash * collisions. 
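
The __wake_up_common() change above is what lets AIO hook a wait queue without a sleeping task: entries whose func pointer is set get the callback invoked directly instead of try_to_wake_up(), and exclusive entries still count against nr_exclusive. A minimal userspace sketch of that dispatch, with illustrative types and names:

#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waitq_entry {
	unsigned int flags;
	const char *task;			/* stand-in for a task pointer */
	void (*func)(struct waitq_entry *);	/* NULL for ordinary waiters */
};

static void aio_style_callback(struct waitq_entry *e)
{
	printf("callback waiter %p run (work queued instead of a task woken)\n",
	       (void *)e);
}

static void wake_up_list(struct waitq_entry *list, int n, int nr_exclusive)
{
	int i;

	for (i = 0; i < n; i++) {
		struct waitq_entry *curr = &list[i];

		if (curr->func) {
			unsigned int flags = curr->flags;
			curr->func(curr);
			if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
				break;
			continue;
		}
		printf("waking task %s\n", curr->task);
		if ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}

int main(void)
{
	struct waitq_entry q[] = {
		{ 0, "reader", NULL },
		{ WQ_FLAG_EXCLUSIVE, NULL, aio_style_callback },
		{ WQ_FLAG_EXCLUSIVE, "writer", NULL },	/* not reached: nr_exclusive hit */
	};

	wake_up_list(q, 3, 1);
	return 0;
}
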
*/ -static inline wait_queue_head_t * page_waitqueue(struct page *page) +inline wait_queue_head_t * page_waitqueue(struct page *page) { pg_data_t * pgdat = page_zone(page)->zone_pgdat; return wait_table_hashfn(page, &pgdat->wait_table); @@ -1309,10 +1311,17 @@ void __do_generic_file_read(struct file int reada_ok; int error; int max_readahead = get_max_readahead(inode); + loff_t pos; + + pos = *ppos; + if (unlikely(pos < 0)) { + desc->error = -EINVAL; + return; + } cached_page = NULL; - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; /* * If the current position is outside the previous read-ahead window, @@ -1360,13 +1369,17 @@ void __do_generic_file_read(struct file end_index = i_size >> PAGE_CACHE_SHIFT; - if (index > end_index) + if (index > end_index) { + desc->error = 0; break; + } nr = PAGE_CACHE_SIZE; if (index == end_index) { nr = i_size & ~PAGE_CACHE_MASK; - if (nr <= offset) + if (nr <= offset) { + desc->error = 0; break; + } } nr = nr - offset; @@ -3209,3 +3222,698 @@ void __init page_cache_init(unsigned lon panic("Failed to allocate page hash table\n"); memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); } + +/* address_space_map + * Maps a series of pages from the page cache into the given array. + */ +static int address_space_map(struct address_space *as, unsigned long index, + int nr, struct page **pages, + int *nr_newp, struct page **new_pages) +{ + struct page *cached_page = NULL; + int nr_new = 0; + int ret; + + if (unlikely(nr <= 0)) { + *nr_newp = nr_new; + return 0; + } + + ret = 0; + + spin_lock(&pagecache_lock); + + while (nr > 0) { + struct page **hash = page_hash(as, index); + struct page *page; + + page = __find_page_nolock(as, index, *hash); + if (page) { + page_cache_get(page); +got_page: + pages[ret++] = page; + index++; + nr--; + continue; + } + + if (cached_page) { + __add_to_page_cache(cached_page, as, index, hash); + + spin_unlock(&pagecache_lock); + lru_cache_add(cached_page); + spin_lock(&pagecache_lock); + + nr_new++; + *new_pages++ = page = cached_page; + cached_page = NULL; + goto got_page; + } + spin_unlock(&pagecache_lock); + + cached_page = page_cache_alloc(as); + if (!cached_page) + goto out; + + /* Okay, we now have an allocated page. Retry + * the search and add. */ + spin_lock(&pagecache_lock); + } + + spin_unlock(&pagecache_lock); + +out: + if (cached_page) + page_cache_release(cached_page); + + *nr_newp = nr_new; + return ret ? 
ret : -ENOMEM; +} + +struct iodesc { + struct worktodo wtd; + + struct page *good_page; /* the highest Uptodate page */ + int good_idx; + int err; + int did_read; + int rw; + loff_t pos; + + struct page **pages; + struct page **new_pages; + struct page **cur_pagep; + int nr_pages; + int nr_new_pages; + + struct address_space *as; + struct file *file; + kvec_cb_t cb; + + size_t size; + unsigned long transferred; + unsigned offset; + struct kveclet *veclet; + + struct kvec_dst src; + + int sync; + unsigned long rlimit_fsize; + +#define READDESC_NR_DEF 3 + struct page *def_pages[READDESC_NR_DEF]; + struct page *def_new_pages[READDESC_NR_DEF]; +}; + +static void __iodesc_free(struct iodesc *io, int unlock) +{ + kvec_cb_t cb; + ssize_t res; + + if (unlock) { + unsigned i; + for (i=0; inr_pages; i++) { + struct page *page = io->pages[i]; + UnlockPage(page); + page_cache_release(page); + } + } else { + unsigned i; + for (i=0; inr_pages; i++) + page_cache_release(io->pages[i]); + } + + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); + if (io->pages != io->def_pages) + kfree(io->pages); + + cb = io->cb; + res = io->transferred ? io->transferred : io->err; + kfree(io); + + cb.fn(cb.data, cb.vec, res); +} + +/* By the time this function is called, all of the pages prior to + * the current good_idx have been released appropriately. The remaining + * duties are to release any remaining pages and to honour O_SYNC. + */ +static void __iodesc_finish_write(struct iodesc *io) +{ + pr_debug("__iodesc_finish_write(%p)\n", io); + + __iodesc_free(io, WRITE == io->rw); +} + +/* This is mostly ripped from generic_file_write */ +static int __iodesc_write_page(struct iodesc *io, struct page *page) +{ + char *kaddr = kmap(page); + unsigned long bytes; + unsigned long offset; + long status; + int done = 0; + + offset = io->offset; + kaddr += offset; + + bytes = PAGE_CACHE_SIZE - offset; + if (io->size < bytes) + bytes = io->size; + + pr_debug("__iodesc_write_page(%p (%lu), %lu %lu)\n", page, page->index, offset, bytes); + + io->err = io->as->a_ops->prepare_write(io->file, page, + offset, offset + bytes); + if (unlikely(io->err)) { + pr_debug("prepare_write: %d\n", io->err); + kunmap(page); + return 1; + } + + kvec_dst_map(&io->src); + memcpy_from_kvec_dst(kaddr, &io->src, bytes); + kvec_dst_unmap(&io->src); /* commit_write may block */ + + flush_dcache_page(page); + status = io->as->a_ops->commit_write(io->file, page, + offset, offset+bytes); + + /* We don't handle short writes */ + if (status > 0 && status != bytes) + done = 1; + + if (!status) + status = bytes; + + if (likely(status > 0)) { + io->transferred += status; + io->size -= status; + io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1); + + if (io->offset) + done = 1; + } else { + io->err = status; + done = 1; + } + + kunmap(page); + return done; +} + +void __iodesc_sync_wait_page(void *data) +{ + struct iodesc *io = data; + + do { + struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers; + + if (!head) + continue; + + bh = head; + do { + if (buffer_locked(bh)) { + pr_debug("waiting on bh=%pi io=%p\n", bh, io); + if (!wtd_wait_on_buffer(&io->wtd, bh)) + return; + } + if (buffer_req(bh) && !buffer_uptodate(bh)) { + pr_debug("io err bh=%p (%p)\n", bh, io); + io->err = -EIO; + break; + } + } while ((bh = bh->b_this_page) != head); + } while (!io->err && ++io->good_idx < io->nr_pages) ; + + pr_debug("finish_write(%p)\n", io); + __iodesc_finish_write(io); +} + +static void __iodesc_do_write(void *data) +{ + struct iodesc *io = data; 
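+	/*
+	 * Copy and commit each mapped page in turn via __iodesc_write_page()
+	 * (prepare_write, copy from the source kvec, commit_write) until the
+	 * request is exhausted or an error occurs, then drop the i_sem taken
+	 * in generic_file_rw_kvec().  O_SYNC writes push the pages to disk
+	 * and defer completion to __iodesc_sync_wait_page(); everything else
+	 * completes immediately via __iodesc_finish_write().
+	 */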
+ unsigned i; + + for (i=0; inr_pages; i++) { + if (__iodesc_write_page(io, io->pages[i])) + break; + } + + up(&io->file->f_dentry->d_inode->i_sem); + + if (io->sync) { + io->good_idx = 0; + + pr_debug("writing out pages(%p)\n", io); + for (i=0; inr_pages; i++) { + if (io->pages[i]->buffers) + writeout_one_page(io->pages[i]); + } + + pr_debug("calling __iodesc_sync_wait_page(%p)\n", io); + wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io); + __iodesc_sync_wait_page(io); + return; + } + + __iodesc_finish_write(io); +} + +static void __iodesc_write_lock_next_page(void *data) +{ + struct iodesc *io = data; + pr_debug("__iodesc_write_next_page(%p)\n", io); + + while (io->good_idx < io->nr_pages) { + io->good_page = io->pages[io->good_idx++]; + if (io->good_page == *io->cur_pagep) + io->cur_pagep++; + else { + if (!wtd_lock_page(&io->wtd, io->good_page)) + return; + } + } + + //Is this faster? __iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + struct page **src_pagep; + char *dst_addr, *src_addr; + int src_off; + size_t size; + size_t valid; + + struct kveclet *veclet = io->veclet; + struct page *dst_page = veclet->page; + int dst_len = veclet->length; + int dst_off = veclet->offset; + + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + dst_addr = kmap(veclet->page); + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + io->transferred += this; + pr_debug("read_finish: this=%d transferred=%d\n", + this, io->transferred); + + if (size <= 0) + break; + + if (dst_len <= 0) { + kunmap(dst_page); + veclet++; + dst_page = veclet->page; + dst_off = veclet->offset; + dst_len = veclet->length; + dst_addr = kmap(dst_page); + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); + pr_debug("page(%lu)->count = %d\n", + (*src_pagep)->index, + atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(dst_page); + kunmap(*src_pagep); +no_data: + __iodesc_free(io, 0); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); +again: + while (Page_Uptodate(page)) { + pr_debug("page index %lu 
uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + if (!wtd_lock_page(&io->wtd, page)) + return; + locked = 1; + } + + if (!io->did_read) { + /* We haven't tried reading this page before, give it a go. */ + pr_debug("attempting to read %lu\n", page->index); + io->did_read = 1; + if (likely(page->mapping)) { + locked = 0; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + if (wtd_lock_page(&io->wtd, page)) { + locked = 1; + goto again; + } + return; + } + } else + /* page not mapped, erroring out. */ + io->err = 0; + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. Set err to EIO and quite */ + if (!io->err) + io->err = -EIO; + __iodesc_read_finish(io); +} + +static void __wtdgeneric_file_read_iodesc(void *data); + +static void __generic_file_read_iodesc(struct iodesc *io, int mayblock) +{ + int (*readpage)(struct file *, struct page *); + int i; + + wtd_set_action(&io->wtd, __iodesc_make_uptodate, io); + readpage = io->as->a_ops->readpage; + for (i=0; inr_new_pages; i++) { + int ret; + if (!mayblock) { + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + return; + } + ret = readpage(io->file, io->new_pages[i]); + if (ret) + printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, ret); + } + + for (i=0; inr_pages; i++) { + struct page *page = io->pages[i]; + if (Page_Uptodate(page)) { + pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index); + continue; + } + + if (!mayblock) { + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + return; + } + /* + * Lock the page and if it is still mapped in the file, + * attempt to read it in. + */ + if (!TryLockPage(page)) { + if (likely(page->mapping)) { + int ret = readpage(io->file, page); + if (ret) + pr_debug("__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret); + } else + /* page not mapped, truncated! */ + unlock_page(page); + } + + if (!Page_Uptodate(page) && io->good_idx == -1) { + pr_debug("first good_idx=%d (%lu)\n", i, page->index); + io->good_idx = i; + io->good_page = page; + } + } + + /* Whee, all the pages are uptodate! 
*/ + if (!io->good_page) { + pr_debug("all pages uptodate!\n"); + io->good_idx = io->nr_pages; + __iodesc_read_finish(io); + return; + } + + pr_debug("locking good_page\n"); + if (wtd_lock_page(&io->wtd, io->good_page)) + __iodesc_make_uptodate(io); + return; +} + +static void __wtdgeneric_file_read_iodesc(void *data) +{ + struct iodesc *io = data; + __generic_file_read_iodesc(io, 1); +} + +static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos); + +int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, READ, cb, size, pos); +} + +int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, WRITE, cb, size, pos); +} + +void wtd_rw_kvec_core(void *); +int rw_kvec_core(struct iodesc *io); + +int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos) +{ + struct inode *inode = file->f_dentry->d_inode; + int append = file->f_flags & O_APPEND; + struct iodesc *io = NULL; + int ret; + + ret = -EINVAL; + if (unlikely(rw != READ && rw != WRITE)) + goto out; + + /* Don't check pos when appending, but otherwise do santity + * checks before allocating memory. -'ve offsets are invalid. + */ + if (unlikely(!append && pos < 0)) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->file = file; + io->rw = rw; + io->cb = cb; + io->size = size; + io->pos = pos; + io->rlimit_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur; + wtd_set_action(&io->wtd, wtd_rw_kvec_core, io); + + if ((rw == READ) || (0 == wtd_down(&io->wtd, &inode->i_sem))) + return rw_kvec_core(io); + + return 0; + +out: + if (!ret) + cb.fn(cb.data, cb.vec, ret); + return ret; +} + +void wtd_rw_kvec_core(void *data) +{ + struct iodesc *io = data; + kvec_cb_t cb = io->cb; + int ret = rw_kvec_core(io); + if (ret) + cb.fn(cb.data, cb.vec, ret); +} + +int rw_kvec_core(struct iodesc *io) +{ + int append = io->file->f_flags & O_APPEND; + struct inode *inode = io->file->f_dentry->d_inode; + struct address_space *as = inode->i_mapping; + unsigned long index; + unsigned long eindex; + unsigned long nr_pages; + int ret; + + if (io->rw == WRITE) { + unsigned long long tmp; + loff_t limit; + + /* We've already down'd the inode semaphore */ + if (append) + io->pos = inode->i_size; + + limit = io->rlimit_fsize; + if (likely(RLIM_INFINITY == limit)) + limit = OFFSET_MAX; + + /* Filesystem limits take precedence over user limits */ + if (likely(inode->i_sb->s_maxbytes < limit)) + limit = inode->i_sb->s_maxbytes; + + if (unlikely(io->pos >= limit)) { + pr_debug("maxbytes: %Ld\n", limit); + ret = 0; + if (io->size || io->pos > limit) + ret = -EFBIG; + goto out_io; + } + + /* Clamp writes straddling limit. 
*/ + tmp = io->pos + io->size; + if (unlikely(tmp > (unsigned long long)limit)) + io->size = limit - io->pos; + } + + if (READ == io->rw) { + pr_debug("io->pos=%Ld i_size=%Ld\n", io->pos, inode->i_size); + + if (io->pos > inode->i_size) + io->size = 0; + else if ((io->pos + io->size) > inode->i_size) { + size_t size = inode->i_size - io->pos; + if (size < io->size) + io->size = size; + } + + pr_debug("io->size=%d\n", io->size); + } + + ret = 0; + if (unlikely(!io->size)) + goto out_io; + + index = io->pos >> PAGE_CACHE_SHIFT; + eindex = (io->pos + io->size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->as = as; + io->offset = (unsigned long)io->pos & (PAGE_CACHE_SIZE - 1); + kvec_dst_init(&io->src); + kvec_dst_set(&io->src, io->cb.vec->veclet); + io->veclet = io->cb.vec->veclet; + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (io->rw == READ) + __generic_file_read_iodesc(io, 0); + else if (io->rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + if (io->rw == WRITE) + up(&inode->i_sem); + if (!ret) + io->cb.fn(io->cb.data, io->cb.vec, ret); + kfree(io); + return ret; +} diff -urNp x-ref/mm/memory.c x/mm/memory.c --- x-ref/mm/memory.c 2003-02-14 05:22:27.000000000 +0100 +++ x/mm/memory.c 2003-02-14 05:22:30.000000000 +0100 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -447,9 +448,11 @@ static struct page * follow_page(struct pte = *ptep; pte_kunmap(ptep); if (pte_present(pte)) { + struct page * page = pte_page(pte); + prefetch(page); if (!write || (pte_write(pte) && pte_dirty(pte))) - return pte_page(pte); + return page; } if (pte_none(pte)) none = 1; @@ -1599,3 +1602,233 @@ struct page * vmalloc_to_page(void * vma } return page; } + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + * FIXME: some architectures need to flush the cache based on user addresses + * here. Someone please provide a better macro than flush_cache_page. + */ + +#define dprintk(x...) 
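
rw_kvec_core() above sizes its page arrays from the byte range of the request: the span [pos, pos+size) touches page-cache pages index..eindex, starting `offset` bytes into the first page. A minimal sketch of that arithmetic with hypothetical values (4K pages assumed for the example):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* assume 4K pages for the example */
#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long pos = 5000;	/* hypothetical file position */
	unsigned long size = 10000;	/* hypothetical request size */

	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	unsigned long eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = eindex - index + 1;
	unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);

	/* bytes 5000..14999 cover pages 1..3, starting 904 bytes into page 1 */
	printf("pages %lu..%lu (%lu pages), first-page offset %lu\n",
	       index, eindex, nr_pages, offset);
	return 0;
}
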
+atomic_t user_pinned_pages = ATOMIC_INIT(0); + +struct kvec *map_user_kvec(int rw, unsigned long ptr, size_t len) +{ + return mm_map_user_kvec(current->mm, rw, ptr, len); +} + +struct kvec *mm_map_user_kvec(struct mm_struct *mm, int rw, unsigned long ptr, + size_t len) +{ + struct kvec *vec; + struct kveclet *veclet; + unsigned long end; + int err; + int i; + int datain = (rw == READ); + int nr_pages; + + end = ptr + len; + if (unlikely(end < ptr)) + return ERR_PTR(-EINVAL); + + nr_pages = (ptr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages -= ptr >> PAGE_SHIFT; + nr_pages ++; + + atomic_add(nr_pages, &user_pinned_pages); + err = -EAGAIN; + if (unlikely(atomic_read(&user_pinned_pages) >= aio_max_pinned)) + goto out_adjust; + + vec = kmalloc(sizeof(struct kvec) + nr_pages * sizeof(struct kveclet), + GFP_KERNEL); + err = -ENOMEM; + if (unlikely(!vec)) + goto out_adjust; + + vec->nr = 0; + vec->max_nr = nr_pages; + veclet = vec->veclet; + + /* Make sure the iobuf is not already mapped somewhere. */ + dprintk ("map_user_kiobuf: begin\n"); + + down_read(&mm->mmap_sem); + + err = -EFAULT; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + veclet->offset = ptr & ~PAGE_MASK; + veclet->length = PAGE_SIZE - veclet->offset; + if (len < veclet->length) + veclet->length = len; + ptr &= PAGE_MASK; + len -= veclet->length; + + err = get_user_pages(current, mm, ptr, 1, + datain, 0, &veclet->page, NULL); + if (unlikely(err < 0)) + goto out_unlock; + + veclet++; + ptr += PAGE_SIZE; + vec->nr = ++i; + } + + veclet->page = NULL; /* dummy for the prefetch in free_kvec */ + veclet->length = 0; /* bug checking ;-) */ + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return vec; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kvec(vec, 0); + kfree(vec); + dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw); + return ERR_PTR(err); + + out_adjust: + atomic_sub(nr_pages, &user_pinned_pages); + dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw); + return ERR_PTR(err); +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kvec (struct kvec *vec, int dirtied) +{ + struct kveclet *veclet = vec->veclet; + struct kveclet *end = vec->veclet + vec->nr; + struct page *map = veclet->page; + + prefetchw(map); + for (; vecletpage) { + prefetchw(veclet[1].page); + if (likely(map != NULL) && !PageReserved(map)) { + if (dirtied) { + SetPageDirty(map); + flush_dcache_page(map); /* FIXME */ + } + __free_page(map); + } + } + + atomic_sub(vec->max_nr, &user_pinned_pages); + vec->nr = 0; +} + +void free_kvec(struct kvec *vec) +{ + if (unlikely(vec->nr)) + BUG(); + kfree(vec); +} + +/* kvec memory copy helper: appends len bytes in from to dst. + */ +void memcpy_to_kvec_dst(struct kvec_dst *dst, const char *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (dst->space < cnt) + cnt = dst->space; + + memcpy(dst->dst, from, cnt); + from += cnt; + dst->space -= cnt; + dst->dst += cnt; + len -= cnt; + if (!dst->space && len) { + kvec_dst_unmap(dst); + dst->let++; + dst->offset = 0; + kvec_dst_map(dst); + if (unlikely(!dst->space)) + BUG(); + } + } while (len); +} + +/* kvec memory copy helper: copies and consumes len bytes in from to dst. 
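
memcpy_to_kvec_dst() above appends into the destination veclets one at a time, switching to the next veclet whenever the current one fills. A minimal userspace sketch of the same copy loop, with plain heap buffers standing in for the kmap'ed pages and illustrative names (the real code BUGs if it runs out of veclets; the sketch simply assumes enough space):

#include <stdio.h>
#include <string.h>

struct seglet {			/* stands in for struct kveclet */
	char *buf;
	unsigned offset;
	unsigned length;
};

static void copy_to_segments(struct seglet *let, const char *from, long len)
{
	char *dst = let->buf + let->offset;
	long space = let->length;

	while (len) {
		long cnt = len < space ? len : space;

		memcpy(dst, from, cnt);
		from += cnt;
		dst += cnt;
		space -= cnt;
		len -= cnt;
		if (!space && len) {		/* advance to the next seglet */
			let++;
			dst = let->buf + let->offset;
			space = let->length;
		}
	}
}

int main(void)
{
	char a[8] = "", b[8] = "";
	struct seglet vec[] = { { a, 2, 4 }, { b, 0, 8 } };

	copy_to_segments(vec, "0123456789", 10);
	printf("seg0: %.4s  seg1: %.6s\n", a + 2, b);	/* "0123"  "456789" */
	return 0;
}
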
+ */ +void memcpy_from_kvec_dst(char *to, struct kvec_dst *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (from->space < cnt) + cnt = from->space; + + memcpy(to, from->dst, cnt); + to += cnt; + from->space -= cnt; + from->dst += cnt; + len -= cnt; + if (unlikely(!from->space && len)) { + kvec_dst_unmap(from); + from->let++; + from->offset = 0; + kvec_dst_map(from); + if (unlikely(!from->space)) + BUG(); + } + } while (len); +} + +/* + */ +int copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len) +{ + struct kveclet *let = to->veclet; + int ret = 0; + + if ((ssize_t)len < 0) + BUG(); + + while (offset) { + if (offset < let->length) + break; + offset -= let->length; + let++; + + if ((let - to->veclet) > to->nr) + BUG(); + } + + /* FIXME: kmap deadlockage */ + while (len && !ret) { + char *dst = kmap(let->page); + size_t this; + + this = let->length - offset; + if (len < this) + this = len; + + offset += let->offset; + if (copy_from_user(dst+offset, from, this)) + ret = -EFAULT; + + from += this; + len -= this; + kunmap(let->page); + offset = 0; + let ++; + } + + return ret; +} diff -urNp x-ref/mm/wtd.c x/mm/wtd.c --- x-ref/mm/wtd.c 1970-01-01 01:00:00.000000000 +0100 +++ x/mm/wtd.c 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,77 @@ +#include +#include +#include +#include + +static void __wtd_lock_page_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct page *page = (struct page *)wtd->data; + + if (!TryLockPage(page)) { + __remove_wait_queue(page_waitqueue(page), &wtd->wait); + wtd_queue(wtd); + } else + async_run_tq_disk(); +} + +int wtd_lock_page(struct worktodo *wtd, struct page *page) +{ + if (TryLockPage(page)) { + wtd->data = page; + init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter); + + /* Wakeups may race with TryLockPage, so try again within the wait + * queue spinlock. + */ + if (!add_wait_queue_cond(page_waitqueue(page), &wtd->wait, + TryLockPage(page))) { + /* Page is still locked. Kick the disk queue... */ + run_task_queue(&tq_disk); + return 0; + } + } + + return 1; +} + +static void __wtd_bh_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct buffer_head *bh = (struct buffer_head *)wtd->data; + + if (!buffer_locked(bh)) { + __remove_wait_queue(&bh->b_wait, &wtd->wait); + wtd_queue(wtd); + } else + async_run_tq_disk(); +} + +int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh) +{ + if (!buffer_locked(bh)) { + return 1; + } + wtd->data = bh; + init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter); + if (add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh))) + return 1; + run_task_queue(&tq_disk); + return 0; +} + +static void do_run_tq_disk(void *data) +{ + run_task_queue(&tq_disk); +} + +static struct tq_struct run_disk_tq = { + .routine = do_run_tq_disk, +}; + +void async_run_tq_disk(void) +{ + mb(); /* going to read tq_disk locklessy */ + if (TQ_ACTIVE(tq_disk)) + schedule_task(&run_disk_tq); +}
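
With the syscalls wired up, the ABI in aio_abi.h is usable directly from userspace. Below is a minimal sketch of the calling convention: set up a context, submit one IOCB_CMD_PREAD, reap the completion event, and tear the context down. The __NR_io_* numbers are deliberately not hard-coded here; they must come from the patched kernel's headers for the target architecture, so treat this as an illustration of the structures and argument order rather than a drop-in test program.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>	/* aio_context_t, struct iocb, struct io_event */

static long io_setup(unsigned nr, aio_context_t *ctx)
{ return syscall(__NR_io_setup, nr, ctx); }
static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbs)
{ return syscall(__NR_io_submit, ctx, nr, iocbs); }
static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{ return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout); }
static long io_destroy(aio_context_t ctx)
{ return syscall(__NR_io_destroy, ctx); }

int main(int argc, char *argv[])
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	static char buf[4096];
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (io_setup(32, &ctx) < 0)		/* room for 32 in-flight iocbs */
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes = fd;
	cb.aio_buf = (unsigned long)buf;	/* __u64 in the ABI */
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;
	cb.aio_data = 0x1234;			/* cookie, returned in ev.data */

	if (io_submit(ctx, 1, cbs) == 1 &&
	    io_getevents(ctx, 1, 1, &ev, NULL) == 1)
		printf("res=%lld cookie=%llx\n",
		       (long long)ev.res, (unsigned long long)ev.data);

	io_destroy(ctx);
	close(fd);
	return 0;
}
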