diff -urNp x-ref/Documentation/AIO-NOTES x/Documentation/AIO-NOTES --- x-ref/Documentation/AIO-NOTES 1970-01-01 01:00:00.000000000 +0100 +++ x/Documentation/AIO-NOTES 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,3 @@ +- aio context destruction is now synchronous: it waits for all pending + ios to complete. This will now cause a task that is exiting to be + delayed if outstanding ios are executing. diff -urNp x-ref/MAINTAINERS x/MAINTAINERS --- x-ref/MAINTAINERS 2003-02-14 05:22:25.000000000 +0100 +++ x/MAINTAINERS 2003-02-14 05:22:30.000000000 +0100 @@ -237,6 +237,12 @@ M: layes@loran.com L: linux-net@vger.kernel.org S: Maintained +ASYNC IO +P: Benjamin LaHaise +M: bcrl@redhat.com +L: linux-aio@kvack.org +S: Maintained + AX.25 NETWORK LAYER P: Matthias Welwarsky M: dg2fef@afthd.tu-darmstadt.de diff -urNp x-ref/arch/i386/kernel/entry.S x/arch/i386/kernel/entry.S --- x-ref/arch/i386/kernel/entry.S 2003-02-14 05:22:21.000000000 +0100 +++ x/arch/i386/kernel/entry.S 2003-02-14 05:22:30.000000000 +0100 @@ -659,11 +659,11 @@ ENTRY(sys_call_table) .long SYMBOL_NAME(sys_sched_getaffinity) .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_thread_area */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_get_thread_area */ - .long SYMBOL_NAME(sys_ni_syscall) /* 245 sys_io_setup */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_destroy */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_getevents */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_submit */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_cancel */ + .long SYMBOL_NAME(sys_io_setup) /* 245 */ + .long SYMBOL_NAME(sys_io_destroy) + .long SYMBOL_NAME(sys_io_getevents) + .long SYMBOL_NAME(sys_io_submit) + .long SYMBOL_NAME(sys_io_cancel) .long SYMBOL_NAME(sys_ni_syscall) /* 250 sys_alloc_hugepages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_free_hugepages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_exit_group */ diff -urNp x-ref/arch/i386/kernel/semaphore.c x/arch/i386/kernel/semaphore.c --- x-ref/arch/i386/kernel/semaphore.c 2002-11-29 02:22:55.000000000 +0100 +++ x/arch/i386/kernel/semaphore.c 2003-02-14 05:22:30.000000000 +0100 @@ -14,6 +14,7 @@ */ #include #include +#include #include /* @@ -54,6 +55,54 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; +void __wtd_down(struct semaphore * sem, struct worktodo *wtd); + +void __wtd_down_action(void *data) +{ + struct worktodo *wtd = data; + struct semaphore *sem; + + wtd_pop(wtd); + sem = wtd->data; + + __wtd_down(sem, wtd); +} + +void __wtd_down_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct semaphore *sem = wtd->data; + + __remove_wait_queue(&sem->wait, &wtd->wait); + wtd_push(wtd, __wtd_down_action, wtd); + wtd_queue(wtd); +} + +void __wtd_down(struct semaphore * sem, struct worktodo *wtd) +{ + int gotit; + int sleepers; + + init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter); + wtd->data = sem; + + spin_lock_irq(&semaphore_lock); + sem->sleepers++; + sleepers = sem->sleepers; + gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait, + atomic_add_negative(sleepers - 1, &sem->count)); + if (gotit) + sem->sleepers = 0; + else + sem->sleepers = 1; + spin_unlock_irq(&semaphore_lock); + + if (gotit) { + wake_up(&sem->wait); + wtd_queue(wtd); + } +} + void __down(struct semaphore * sem) { struct task_struct *tsk = current; @@ -257,6 +306,21 @@ asm( "ret" ); +asm( +".text\n" +".align 4\n" +".globl __wtd_down_failed\n" +"__wtd_down_failed:\n\t" + "pushl %eax\n\t" + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __wtd_down\n\t" 
+ "popl %ecx\n\t" + "popl %edx\n\t" + "popl %eax\n\t" + "ret" +); + /* * rw spinlock fallbacks */ diff -urNp x-ref/arch/x86_64/kernel/semaphore.c x/arch/x86_64/kernel/semaphore.c --- x-ref/arch/x86_64/kernel/semaphore.c 2002-11-29 02:22:58.000000000 +0100 +++ x/arch/x86_64/kernel/semaphore.c 2003-02-14 05:22:30.000000000 +0100 @@ -14,6 +14,7 @@ */ #include #include +#include #include @@ -167,4 +168,51 @@ int __down_trylock(struct semaphore * se return 1; } +void __wtd_down(struct semaphore * sem, struct worktodo *wtd); + +void __wtd_down_action(void *data) +{ + struct worktodo *wtd = data; + struct semaphore *sem; + + wtd_pop(wtd); + sem = wtd->data; + + __wtd_down(sem, wtd); +} + +void __wtd_down_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct semaphore *sem = wtd->data; + + __remove_wait_queue(&sem->wait, &wtd->wait); + wtd_push(wtd, __wtd_down_action, wtd); + wtd_queue(wtd); +} + +void __wtd_down(struct semaphore * sem, struct worktodo *wtd) +{ + int gotit; + int sleepers; + + init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter); + wtd->data = sem; + + spin_lock_irq(&semaphore_lock); + sem->sleepers++; + sleepers = sem->sleepers; + gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait, + atomic_add_negative(sleepers - 1, &sem->count)); + if (gotit) + sem->sleepers = 0; + else + sem->sleepers = 1; + spin_unlock_irq(&semaphore_lock); + + if (gotit) { + wake_up(&sem->wait); + wtd_queue(wtd); + } +} diff -urNp x-ref/arch/x86_64/lib/thunk.S x/arch/x86_64/lib/thunk.S --- x-ref/arch/x86_64/lib/thunk.S 2002-11-29 02:22:58.000000000 +0100 +++ x/arch/x86_64/lib/thunk.S 2003-02-14 05:22:30.000000000 +0100 @@ -41,7 +41,8 @@ thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up - + thunk __wtd_down_failed,__wtd_down + restore: RESTORE_ARGS ret diff -urNp x-ref/drivers/char/raw.c x/drivers/char/raw.c --- x-ref/drivers/char/raw.c 2003-02-14 05:22:15.000000000 +0100 +++ x/drivers/char/raw.c 2003-02-14 05:22:30.000000000 +0100 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #define dprintk(x...) 
@@ -36,7 +38,8 @@ int raw_open(struct inode *, struct file int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); int raw_ioctl(struct inode *, struct file *, unsigned int, unsigned long); - +int raw_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_write(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos); static struct file_operations raw_fops = { read: raw_read, @@ -44,6 +47,10 @@ static struct file_operations raw_fops = open: raw_open, release: raw_release, ioctl: raw_ioctl, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, + kvec_read: raw_kvec_read, + kvec_write: raw_kvec_write, }; static struct file_operations raw_ctl_fops = { @@ -407,3 +414,100 @@ ssize_t rw_raw_dev(int rw, struct file * out: return err; } + +static int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos); +int raw_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, READ, cb, size, pos); +} + +int raw_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return raw_kvec_rw(file, WRITE, cb, size, pos); +} + +int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos) +{ + int err; + unsigned minor; + kdev_t dev; + unsigned long limit, blocknr, blocks; + + unsigned sector_size, sector_bits, sector_mask; + unsigned max_sectors; + unsigned i; + + pr_debug("raw_kvec_rw: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos); + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; + sector_mask = sector_size- 1; + max_sectors = KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + pr_debug ("raw_kvec_rw: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + /* EOF at the end */ + err = 0; + if (!size || (pos >> sector_bits) == limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + cb.fn(cb.data, cb.vec, err); + return 0; + } + + /* ENXIO for io beyond the end */ + err = -ENXIO; + if ((pos >> sector_bits) >= limit) { + pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + goto out; + } + + err = -EINVAL; + if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) { + pr_debug("pos(%Ld)/size(%lu) wrong(%d)\n", pos, size, sector_mask); + goto out; + } + + /* Verify that the scatter-gather list is sector aligned. */ + for (i=0; inr; i++) + if ((cb.vec->veclet[i].offset & sector_mask) || + (cb.vec->veclet[i].length & sector_mask)) { + pr_debug("veclet offset/length wrong"); + goto out; + } + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. 
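+ * The chunk is clamped to max_sectors (derived from KIO_MAX_SECTORS) and to the
+ * sectors remaining before the device limit (limit - blocknr) before being handed
+ * to brw_kvec_async().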
+ */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + if (!blocks) { + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + goto out; + } + + err = brw_kvec_async(rw, cb, dev, blocks, blocknr, sector_bits); + async_run_tq_disk(); +out: + if (err) + printk(KERN_DEBUG "raw_kvec_rw: ret is %d\n", err); + return err; +} + diff -urNp x-ref/fs/Makefile x/fs/Makefile --- x-ref/fs/Makefile 2003-02-14 05:22:25.000000000 +0100 +++ x/fs/Makefile 2003-02-14 05:22:30.000000000 +0100 @@ -20,6 +20,9 @@ obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-y += aio.o +export-objs += aio.o + subdir-$(CONFIG_PROC_FS) += proc subdir-y += partitions diff -urNp x-ref/fs/aio.c x/fs/aio.c --- x-ref/fs/aio.c 1970-01-01 01:00:00.000000000 +0100 +++ x/fs/aio.c 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,1377 @@ +/* + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * See ../COPYING for licensing terms. + */ +#include +#include +#include +#include + +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) do { ; } while (0) +#endif + +/*------ sysctl variables----*/ +unsigned aio_nr; /* current system wide number of aio requests */ +unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +unsigned aio_max_size = 0x20000; /* 128KB per chunk */ +unsigned aio_max_pinned; /* set to mem/4 in aio_setup */ +/*----end sysctl variables---*/ + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kioctx_cachep; + +/* Used for rare fput completion. */ +static void aio_fput_routine(void *); +static struct tq_struct fput_tqueue = { + routine: aio_fput_routine, +}; + +static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(fput_head); + +/* forward prototypes */ +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res); +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res); + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. 
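+ * Registered as an __initcall (see the bottom of this file), so the caches
+ * exist before any io_* syscall can be reached.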
+ */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + aio_max_pinned = num_physpages/4; + + printk(KERN_NOTICE "aio_setup: num_physpages = %u\n", aio_max_pinned); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +static void ioctx_free_reqs(struct kioctx *ctx) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, &ctx->free_reqs) { + struct kiocb *iocb = list_kiocb(pos); + list_del(&iocb->list); + kmem_cache_free(kiocb_cachep, iocb); + } +} + +static void aio_free_ring(struct kioctx *ctx) +{ + struct aio_ring_info *info = &ctx->ring_info; + + if (info->kvec) { + unmap_kvec(info->kvec, 1); + free_kvec(info->kvec); + } + + if (info->mmap_size) { + down_write(&ctx->mm->mmap_sem); + do_munmap(ctx->mm, info->mmap_base, info->mmap_size); + up_write(&ctx->mm->mmap_sem); + } + + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + info->ring_pages = NULL; + info->nr = 0; +} + +static int aio_setup_ring(struct kioctx *ctx) +{ + struct aio_ring *ring; + struct aio_ring_info *info = &ctx->ring_info; + unsigned nr_reqs = ctx->max_reqs; + unsigned long size; + int nr_pages, i; + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_reqs += 2; /* 1 is required, 2 for good luck */ + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_reqs; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages < 0) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_reqs = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + dprintk("attempting mmap of %lu bytes\n", info->mmap_size); + down_write(&ctx->mm->mmap_sem); + info->mmap_base = do_mmap(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, + 0); + up_write(&ctx->mm->mmap_sem); + if (IS_ERR((void *)info->mmap_base)) { + printk("mmap err: %ld\n", -info->mmap_base); + info->mmap_size = 0; + aio_free_ring(ctx); + return -EAGAIN; + } + dprintk("mmap address: 0x%08lx\n", info->mmap_base); + info->kvec = map_user_kvec(READ, info->mmap_base, info->mmap_size); + if (unlikely(IS_ERR(info->kvec))) { + info->kvec = NULL; + aio_free_ring(ctx); + return -EAGAIN; + } + + if (unlikely(info->kvec->nr != nr_pages)) + BUG(); + + for (i=0; ikvec->veclet[i].offset)) + BUG(); + info->ring_pages[i] = info->kvec->veclet[i].page; + //printk("[%d] %p -> %p\n", i, info->kvec->veclet[i].page, + // info->pages[i]); + } + + + ctx->user_id = info->mmap_base; + + info->nr = nr_reqs; /* trusted copy */ + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring->nr = nr_reqs; /* user copy */ + ring->id = ctx->user_id; + kunmap_atomic(ring, KM_USER0); + + return 0; +} + +/* aio_ring_event: returns a pointer to the event at the given index from + * kmap_atomic(, km). 
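+ * Events on the first ring page sit behind the struct aio_ring header, so indexes
+ * of AIO_EVENTS_FIRST_PAGE and above are remapped onto the pages that follow.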
Release the pointer with put_aio_ring_event(); + */ +static inline struct io_event *aio_ring_event(struct aio_ring_info *info, int nr, enum km_type km) +{ + struct io_event *events; +#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) + + if (nr < AIO_EVENTS_FIRST_PAGE) { + struct aio_ring *ring; + ring = kmap_atomic(info->ring_pages[0], km); + return &ring->io_events[nr]; + } + nr -= AIO_EVENTS_FIRST_PAGE; + + events = kmap_atomic(info->ring_pages[1 + nr / AIO_EVENTS_PER_PAGE], km); + + return events + (nr % AIO_EVENTS_PER_PAGE); +} + +static inline void put_aio_ring_event(struct io_event *event, enum km_type km) +{ + void *p = (void *)((unsigned long)event & PAGE_MASK); + kunmap_atomic(p, km); +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_reqs) +{ + struct kioctx *ctx; + unsigned i; + + /* Prevent overflows */ + if ((nr_reqs > (0x10000000U / sizeof(struct io_event))) || + (nr_reqs > (0x10000000U / sizeof(struct kiocb)))) { + pr_debug("ENOMEM: nr_reqs too high\n"); + return ERR_PTR(-EINVAL); + } + + if (nr_reqs > aio_max_nr) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + ctx->max_reqs = nr_reqs; + ctx->mm = current->mm; + atomic_inc(&ctx->mm->mm_count); + + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->lock); + spin_lock_init(&ctx->ring_info.ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->free_reqs); + INIT_LIST_HEAD(&ctx->active_reqs); + + if (aio_setup_ring(ctx) < 0) + goto out_freectx; + + /* Allocate nr_reqs iocbs for io. Free iocbs are on the + * ctx->free_reqs list. When active they migrate to the + * active_reqs list. During completion and cancellation + * the request may temporarily not be on any list. + */ + for (i=0; ikey = i; + iocb->users = 0; + list_add(&iocb->list, &ctx->free_reqs); + } + + /* now link into global list. kludge. FIXME */ + br_write_lock(BR_AIO_REQ_LOCK); + if (unlikely(aio_nr + ctx->max_reqs > aio_max_nr)) + goto out_cleanup; + aio_nr += ctx->max_reqs; /* undone by __put_ioctx */ + ctx->next = current->mm->ioctx_list; + current->mm->ioctx_list = ctx; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, current->mm, ctx->ring_info.ring->nr); + return ctx; + +out_cleanup: + br_write_unlock(BR_AIO_REQ_LOCK); + ctx->max_reqs = 0; /* prevent __put_ioctx from sub'ing aio_nr */ + __put_ioctx(ctx); + return ERR_PTR(-EAGAIN); + +out_freering: + aio_free_ring(ctx); + ioctx_free_reqs(ctx); +out_freectx: + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + dprintk("aio: error allocating ioctx %p\n", ctx); + return ctx; +} + +/* aio_cancel_all + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. 
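+ * Each iocb's cancel method is invoked with ctx->lock dropped; iocb->users is
+ * bumped beforehand so the request cannot be freed underneath the cancellation.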
+ */ +static void aio_cancel_all(struct kioctx *ctx) +{ + int (*cancel)(struct kiocb *, struct io_event *); + struct io_event res; + spin_lock_irq(&ctx->lock); + ctx->dead = 1; + while (!list_empty(&ctx->active_reqs)) { + struct list_head *pos = ctx->active_reqs.next; + struct kiocb *iocb = list_kiocb(pos); + list_del_init(&iocb->list); + cancel = iocb->cancel; + if (cancel) + iocb->users++; + spin_unlock_irq(&ctx->lock); + if (cancel) + cancel(iocb, &res); + spin_lock_irq(&ctx->lock); + } + spin_unlock_irq(&ctx->lock); +} + +void wait_for_all_aios(struct kioctx *ctx) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + if (!ctx->reqs_active) + return; + + add_wait_queue(&ctx->wait, &wait); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (ctx->reqs_active) { + dprintk("ctx->reqs_active = %d\n", ctx->reqs_active); + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); +} + +/* exit_aio: called when the last user of mm goes away. At this point, + * there is no way for any new requests to be submited or any of the + * io_* syscalls to be called on the context. However, there may be + * outstanding requests which hold references to the context; as they + * go away, they will call put_ioctx and release any pinned memory + * associated with the request (held via struct page * references). + */ +void exit_aio(struct mm_struct *mm) +{ + struct kioctx *ctx = mm->ioctx_list; + mm->ioctx_list = NULL; + while (ctx) { + struct kioctx *next = ctx->next; + ctx->next = NULL; + aio_cancel_all(ctx); + + wait_for_all_aios(ctx); + + if (1 != atomic_read(&ctx->users)) + dprintk(KERN_DEBUG + "exit_aio:ioctx still alive: %d %d %d\n", + atomic_read(&ctx->users), ctx->dead, + ctx->reqs_active); + put_ioctx(ctx); + ctx = next; + } +} + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. + */ +void __put_ioctx(struct kioctx *ctx) +{ + unsigned nr_reqs = ctx->max_reqs; + + if (unlikely(ctx->reqs_active)) + BUG(); + + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + ioctx_free_reqs(ctx); + kmem_cache_free(kioctx_cachep, ctx); + + br_write_lock(BR_AIO_REQ_LOCK); + aio_nr -= nr_reqs; + br_write_unlock(BR_AIO_REQ_LOCK); +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns -EAGAIN if no requests are free. + */ +static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx)); +static struct kiocb *__aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req = NULL; + struct aio_ring *ring; + + /* Use cmpxchg instead of spin_lock? 
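+	 * For now ctx->lock covers both the free list and the reqs_active vs
+	 * aio_ring_avail() check, so a request is only handed out while there is
+	 * guaranteed space in the event ring.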
*/ + spin_lock_irq(&ctx->lock); + ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + if (likely(!list_empty(&ctx->free_reqs) && + (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)))) { + req = list_kiocb(ctx->free_reqs.next); + list_del(&req->list); + list_add(&req->list, &ctx->active_reqs); + ctx->reqs_active++; + req->user_obj = NULL; + get_ioctx(ctx); + + if (unlikely(req->ctx != NULL)) + BUG(); + req->ctx = ctx; + if (unlikely(req->users)) + BUG(); + req->users = 1; + } + kunmap_atomic(ring, KM_USER0); + spin_unlock_irq(&ctx->lock); + + return req; +} + +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + /* Handle a potential starvation case -- should be exceedingly rare as + * requests will be stuck on fput_head only if the aio_fput_routine is + * delayed and the requests were the last user of the struct file. + */ + req = __aio_get_req(ctx); + if (unlikely(NULL == req)) { + aio_fput_routine(NULL); + req = __aio_get_req(ctx); + } + return req; +} + +static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +{ + req->ctx = NULL; + req->filp = NULL; + req->user_obj = NULL; + ctx->reqs_active--; + list_add(&req->list, &ctx->free_reqs); + + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up(&ctx->wait); +} + +static void aio_fput_routine(void *data) +{ + spin_lock_irq(&fput_lock); + while (likely(!list_empty(&fput_head))) { + struct kiocb *req = list_kiocb(fput_head.next); + struct kioctx *ctx = req->ctx; + + list_del(&req->list); + spin_unlock_irq(&fput_lock); + + /* Complete the fput */ + __fput(req->filp); + + /* Link the iocb into the context's free list */ + spin_lock_irq(&ctx->lock); + really_put_req(ctx, req); + spin_unlock_irq(&ctx->lock); + + put_ioctx(ctx); + spin_lock_irq(&fput_lock); + } + spin_unlock_irq(&fput_lock); +} + +/* __aio_put_req + * Returns true if this put was the last user of the request. + */ +static inline int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", + req, atomic_read(&req->filp->f_count)); + + req->users --; + if (unlikely(req->users < 0)) + BUG(); + if (likely(req->users)) + return 0; + list_del(&req->list); /* remove from active_reqs */ + req->cancel = NULL; + + /* Must be done under the lock to serialise against cancellation. + * Call this aio_fput as it duplicates fput via the fput_tqueue. + */ + if (unlikely(atomic_dec_and_test(&req->filp->f_count))) { + get_ioctx(ctx); + spin_lock(&fput_lock); + list_add(&req->list, &fput_head); + spin_unlock(&fput_lock); + schedule_task(&fput_tqueue); + } else + really_put_req(ctx, req); + return 1; +} + +/* aio_put_req + * Returns true if this put was the last user of the kiocb, + * false if the request is still in use. + */ +int aio_put_req(struct kiocb *req) +{ + struct kioctx *ctx = req->ctx; + int ret; + spin_lock_irq(&ctx->lock); + ret = __aio_put_req(ctx, req); + spin_unlock_irq(&ctx->lock); + if (ret) + put_ioctx(ctx); + return ret; +} + +/* Lookup an ioctx id. ioctx_list is lockless for reads. + * FIXME: this is O(n) and is only suitable for development. 
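+ * Readers walk mm->ioctx_list under br_read_lock(BR_AIO_REQ_LOCK); ioctx_alloc()
+ * and io_destroy() modify the list under the corresponding br_write_lock.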
+ */ +static inline struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct kioctx *ioctx; + struct mm_struct *mm; + + mm = current->mm; + br_read_lock(BR_AIO_REQ_LOCK); + for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) + if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { + get_ioctx(ioctx); + break; + } + br_read_unlock(BR_AIO_REQ_LOCK); + + return ioctx; +} + +/* aio_complete + * Called when the io request on the given iocb is complete. + * Returns true if this is the last user of the request. The + * only other user of the request can be the cancellation code. + */ +int aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ctx; + struct aio_ring_info *info = &ctx->ring_info; + struct aio_ring *ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + int ret; + + /* add a completion event to the ring buffer. + * must be done holding ctx->lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->lock, flags); + + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + + tail = info->tail; + event = aio_ring_event(info, tail, KM_IRQ0); + tail = (tail + 1) % info->nr; + + event->obj = (u64)(unsigned long)iocb->user_obj; + event->data = iocb->user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->user_obj, iocb->user_data, res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + wmb(); + + info->tail = tail; + ring->tail = tail; + + mb(); + if (!ring->woke) + ring->woke = 1; + + put_aio_ring_event(event, KM_IRQ0); + kunmap_atomic(ring, KM_IRQ1); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); + + /* everything turned out well, dispose of the aiocb. */ + ret = __aio_put_req(ctx, iocb); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + + if (ret) + put_ioctx(ctx); + + return ret; +} + +/* aio_read_evt + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched (0 or 1 ;-) + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
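+ * Only the head pointer is advanced here, under info->ring_lock; the tail is
+ * owned by aio_complete(), which updates it under ctx->lock.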
+ */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring_info *info = &ioctx->ring_info; + struct aio_ring *ring; + unsigned long head; + int ret = 0; + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + dprintk("in aio_read_evt h%lu t%lu m%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail, + (unsigned long)ring->nr); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&info->ring_lock); + + head = ring->head % info->nr; + if (head != ring->tail) { + struct io_event *evp = aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + barrier(); + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } + spin_unlock(&info->ring_lock); + +out: + kunmap_atomic(ring, KM_USER0); + dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + struct task_struct *tsk; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up_process(to->tsk); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + to->tsk = current; +} + +static inline void set_timeout(long start_jiffies, struct timeout *to, + const struct timespec *ts) +{ + unsigned long how_long; + + if (ts->tv_sec < 0 || (!ts->tv_sec && !ts->tv_nsec)) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, + long min_nr, long nr, + struct io_event *event, + struct timespec *timeout) +{ + long start_jiffies = jiffies; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct timeout to; + + /* needed to zero any padding within an entry (there shouldn't be + * any, but C is fun! + */ + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (likely(i < nr)) { + ret = aio_read_evt(ctx, &ent); + if (unlikely(ret <= 0)) + break; + + dprintk("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* Could we split the check in two? */ + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + ret = 0; + + /* Good, event copied to userland, update counts. 
*/ + event ++; + i ++; + } + + if (min_nr <= i) + return i; + if (ret) + return ret; + + /* End fast path */ + + if (timeout) { + struct timespec ts; + ret = -EFAULT; + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + goto out; + + init_timeout(&to); + set_timeout(start_jiffies, &to, &ts); + } else { + to.timed_out = 0; + } + + while (likely(i < nr)) { + add_wait_queue_exclusive(&ctx->wait, &wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx, &ent); + if (ret) + break; + if (min_nr <= i) + break; + ret = 0; + if (to.timed_out) /* Only check after read evt */ + break; + schedule(); + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + /*ret = aio_read_evt(ctx, &ent);*/ + } while (1) ; + + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + + if (unlikely(ret <= 0)) + break; + + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (timeout) + clear_timeout(&to); +out: + return i ? i : ret; +} + +/* Take an ioctx and remove it from the list of ioctx's. Protects + * against races with itself via ->dead. + */ +static void io_destroy(struct kioctx *ioctx) +{ + struct kioctx **tmp; + int was_dead; + + /* delete the entry from the list is someone else hasn't already */ + br_write_lock(BR_AIO_REQ_LOCK); + was_dead = ioctx->dead; + ioctx->dead = 1; + for (tmp = ¤t->mm->ioctx_list; *tmp && *tmp != ioctx; + tmp = &(*tmp)->next) + ; + if (*tmp) + *tmp = ioctx->next; + br_write_unlock(BR_AIO_REQ_LOCK); + + dprintk("aio_release(%p)\n", ioctx); + if (likely(!was_dead)) + put_ioctx(ioctx); /* twice for the list */ + + aio_cancel_all(ioctx); + wait_for_all_aios(ioctx); + put_ioctx(ioctx); /* once for the lookup */ +} + +/* sys_io_setup: + * Create an aio_context capable of receiving at least nr_events. + * ctxp must not point to an aio_context that already exists, and + * must be initialized to 0 prior to the call. On successful + * creation of the aio_context, *ctxp is filled in with the resulting + * handle. May fail with -EINVAL if *ctxp is not initialized, + * if the specified nr_events exceeds internal limits. May fail + * with -EAGAIN if the specified nr_events exceeds the user's limit + * of available events. May fail with -ENOMEM if insufficient kernel + * resources are available. May fail with -EFAULT if an invalid + * pointer is passed for ctxp. Will fail with -ENOSYS if not + * implemented. + */ +asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || !nr_reqs || (int)nr_reqs < 0)) { + pr_debug("EINVAL: io_setup: ctx or nr_reqs > max\n"); + goto out; + } + + ioctx = ioctx_alloc(nr_reqs); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) + return 0; + io_destroy(ioctx); + } + +out: + return ret; +} + +/* sys_io_destroy: + * Destroy the aio_context specified. May cancel any outstanding + * AIOs and block on completion. Will fail with -ENOSYS if not + * implemented. May fail with -EFAULT if the context pointed to + * is invalid. 
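+ * As noted in Documentation/AIO-NOTES, destruction is synchronous: outstanding
+ * requests are cancelled where possible and the call waits for the remainder to
+ * complete before returning.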
+ */ +asmlinkage long sys_io_destroy(aio_context_t ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + io_destroy(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +static inline int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, + struct iocb *iocb) +{ + ssize_t (*op)(struct file *, struct kiocb *, struct iocb *); + struct kiocb *req; + struct file *file; + ssize_t ret; + char *buf; + + /* enforce forwards compatibility on users */ + if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 || + iocb->aio_reserved3)) { + pr_debug("EINVAL: io_submit: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) + )) { + pr_debug("EINVAL: io_submit: overflow check\n"); + return -EINVAL; + } + + file = fget(iocb->aio_fildes); + if (unlikely(!file)) + return -EBADF; + + req = aio_get_req(ctx); + if (unlikely(!req)) { + fput(file); + return -EAGAIN; + } + + req->filp = file; + iocb->aio_key = req->key; + ret = put_user(iocb->aio_key, &user_iocb->aio_key); + if (unlikely(ret)) { + dprintk("EFAULT: aio_key\n"); + goto out_put_req; + } + + req->user_obj = user_iocb; + req->user_data = iocb->aio_data; + req->buf = iocb->aio_buf; + req->pos = iocb->aio_offset; + req->size = iocb->aio_nbytes; + req->nr_transferred = 0; + + buf = (char *)(unsigned long)iocb->aio_buf; + + switch (iocb->aio_lio_opcode) { + case IOCB_CMD_PREAD: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_put_req; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, buf, iocb->aio_nbytes))) + goto out_put_req; + op = file->f_op->aio_read; + break; + case IOCB_CMD_PWRITE: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_put_req; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_READ, buf, iocb->aio_nbytes))) + goto out_put_req; + op = file->f_op->aio_write; + break; + case IOCB_CMD_FDSYNC: + case IOCB_CMD_FSYNC: + op = file->f_op->aio_fsync; + break; + default: + dprintk("EINVAL: io_submit: no operation %d provided by aio\n", + iocb->aio_lio_opcode); + ret = -EINVAL; + goto out_put_req; + } + + if (unlikely(!op)) { + dprintk("EINVAL: io_submit: no operation %d provided by lowlevel\n", + iocb->aio_lio_opcode); + ret = -EINVAL; + goto out_put_req; + } + + ret = op(file, req, iocb); + if (unlikely(ret)) { + /* A completion event was sent, so + * submit is a success. */ + pr_debug("io_submit: op returned %ld\n", ret); + aio_complete(req, ret, 0); + } + return 0; + +out_put_req: + aio_put_req(req); + return ret; +} + +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. 
+ */ +asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, + struct iocb **iocbpp) +{ + struct kioctx *ctx; + long ret = 0; + int i; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: io_submit: invalid context id\n"); + return -EINVAL; + } + + for (i=0; ithis_size; + unsigned long buf = iocb->buf; + kvec_cb_t cb; + ssize_t res; + + iocb->this_size = iocb->size - iocb->nr_transferred; + if (iocb->this_size > aio_max_size) + iocb->this_size = aio_max_size; + + buf += iocb->nr_transferred; + cb.vec = mm_map_user_kvec(iocb->ctx->mm, rw, buf, iocb->this_size); + cb.fn = (rw == READ) ? generic_aio_complete_read + : generic_aio_complete_write; + cb.data = iocb; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (unlikely(IS_ERR(cb.vec))) + goto done; + + kvec_op = (rw == READ) ? iocb->filp->f_op->kvec_read + : iocb->filp->f_op->kvec_write; + dprintk("submit: %d %d %d\n", iocb->this_size, iocb->nr_transferred, iocb->size); + res = kvec_op(iocb->filp, cb, iocb->this_size, + iocb->pos + iocb->nr_transferred); + if (!res) { + dprintk("submit okay\n"); + return; + } + dprintk("submit failed: %d\n", res); + + cb.fn(cb.data, cb.vec, res); + return; + +done: + if (unlikely(!iocb->nr_transferred)) + BUG(); + aio_complete(iocb, iocb->nr_transferred, 0); +} + +static void generic_aio_complete_rw(int rw, void *_iocb, struct kvec *vec, ssize_t res) +{ + struct kiocb *iocb = _iocb; + + unmap_kvec(vec, rw == READ); + free_kvec(vec); + + if (res > 0) + iocb->nr_transferred += res; + + /* Was this chunk successful? Is there more left to transfer? */ + if (res == iocb->this_size && iocb->nr_transferred < iocb->size) { + /* We may be in irq context, so queue processing in + * process context. + */ + iocb->this_size = rw; + INIT_TQUEUE(&iocb->u.tq, generic_aio_next_chunk, iocb); + schedule_task(&iocb->u.tq); + return; + } + + aio_complete(iocb, iocb->nr_transferred ? iocb->nr_transferred : res, + 0); +} + +static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(READ, _iocb, vec, res); +} + +static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res) +{ + generic_aio_complete_rw(WRITE, _iocb, vec, res); +} + +ssize_t generic_aio_rw(int rw, struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size) +{ + int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t); + unsigned long buf = iocb->aio_buf; + size_t size = iocb->aio_nbytes; + size_t nr_read = 0; + loff_t pos = iocb->aio_offset; + kvec_cb_t cb; + ssize_t res; + +#if 0 + if (likely(NULL != file->f_op->new_read)) { + nr_read = file->f_op->new_read(file, (void *)buf, size, + &pos, F_ATOMIC); + dprintk("from new_read: nr_read: %ld\n", (long)nr_read); + if ((-EAGAIN == nr_read) || (-EWOULDBLOCKIO == nr_read)) + nr_read = 0; + else if ((nr_read >= min_size) || (nr_read < 0)) { + dprintk("returning nr_read: %ld\n", (long)nr_read); + return nr_read; + } + } + dprintk("nr_read: %ld\n", (long)nr_read); +#endif + + req->nr_transferred = nr_read; + size -= nr_read; + if (size > aio_max_size) + /* We have to split up the request. Pin the mm + * struct for further use with map_user_kvec later. + */ + size = aio_max_size; + else + req->buf = 0; + + req->this_size = size; + + buf += nr_read; + cb.vec = map_user_kvec(rw, buf, size); + cb.fn = (rw == READ) ? 
generic_aio_complete_read + : generic_aio_complete_write; + cb.data = req; + + dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec); + if (IS_ERR(cb.vec)) + return nr_read ? nr_read : PTR_ERR(cb.vec); + + kvec_op = (rw == READ) ? file->f_op->kvec_read : file->f_op->kvec_write; + + res = kvec_op(file, cb, size, pos); + if (unlikely(res != 0)) { + /* If the first chunk was successful, we have to run + * the callback to attempt the rest of the io. + */ + if (res == size && req->buf) { + cb.fn(cb.data, cb.vec, res); + return 0; + } + + unmap_kvec(cb.vec, rw == READ); + free_kvec(cb.vec); + if (nr_read) { + if (res < 0) + res = 0; + res += nr_read; + } + } + return res; +} + +ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + return generic_aio_rw(READ, file, req, iocb, iocb->aio_nbytes); +} + +ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size) +{ + return generic_aio_rw(WRITE, file, req, iocb, 1); +#if 0 + unsigned long buf = iocb.aio_buf; + size_t size = iocb.aio_nbytes; + loff_t pos = iocb.aio_offset; + ssize_t nr_written = 0; + kvec_cb_t cb; + long res; +#if 0 + if (likely(NULL != file->f_op->new_write)) { + nr_written = file->f_op->new_write(file, (void *)buf, size, + &pos, F_ATOMIC); + pr_debug("generic_aio_write: new_write: %ld\n", (long)nr_written); + if (-EAGAIN == nr_written) + nr_written = 0; + if ((nr_written >= min_size) || (nr_written < 0)) + return nr_written; + } +#endif + + req->nr_transferred = nr_written; + size -= nr_written; + if (size > aio_max_size) + size = aio_max_size; + req->this_size = size; + buf += nr_written; + cb.vec = map_user_kvec(WRITE, buf, size); + cb.fn = generic_aio_complete_write; + cb.data = req; + + if (IS_ERR(cb.vec)) { + pr_debug("generic_aio_write: map_user_kvec: %ld\n", PTR_ERR(cb.vec)); + return nr_written ? nr_written : PTR_ERR(cb.vec); + } + + res = file->f_op->kvec_write(file, cb, size, iocb.aio_offset); + pr_debug("generic_aio_write: kvec_write: %ld\n", res); + if (unlikely(res != 0)) { + unmap_kvec(cb.vec, 0); + free_kvec(cb.vec); + if (nr_written) { + if (res < 0) + res = 0; + res += nr_written; + } + } + return res; +#endif +} + +ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb) +{ + return generic_aio_write(file, req, iocb, iocb->aio_nbytes); +} + +/* lookup_kiocb + * Finds a given iocb for cancellation. + * MUST be called with ctx->lock held. + */ +struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb *iocb, u32 key) +{ + struct list_head *pos; + /* TODO: use a hash or array, this sucks. */ + list_for_each(pos, &ctx->free_reqs) { + struct kiocb *kiocb = list_kiocb(pos); + if (kiocb->user_obj == iocb && kiocb->key == key) + return kiocb; + } + return NULL; +} + +/* sys_io_cancel: + * Attempts to cancel an iocb previously passed to io_submit. If + * the operation is successfully cancelled, the resulting event is + * copied into the memory pointed to by result without being placed + * into the completion queue and 0 is returned. May fail with + * -EFAULT if any of the data structures pointed to are invalid. + * May fail with -EINVAL if aio_context specified by ctx_id is + * invalid. May fail with -EAGAIN if the iocb specified was not + * cancelled. Will fail with -ENOSYS if not implemented. 
+ */ +asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb, + struct io_event *result) +{ + int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct kioctx *ctx; + struct kiocb *kiocb; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb && kiocb->cancel) { + cancel = kiocb->cancel; + kiocb->users ++; + } else + cancel = NULL; + spin_unlock_irq(&ctx->lock); + + if (NULL != cancel) { + struct io_event tmp; + ret = cancel(kiocb, &tmp); + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(result, &tmp, sizeof(tmp))) + ret = -EFAULT; + } + } else + dprintk(KERN_DEBUG "iocb has no cancel operation\n"); + + put_ioctx(ctx); + + return ret; +} + +/* io_getevents: + * Attempts to read at least min_nr events and up to nr events from + * the completion queue for the aio_context specified by ctx_id. May + * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, + * if nr is out of range, if when is out of range. May fail with + * -EFAULT if any of the memory specified to is invalid. May return + * 0 or < min_nr if no events are available and the timeout specified + * by when has elapsed, where when == NULL specifies an infinite + * timeout. Note that the timeout pointed to by when is relative and + * will be updated if not NULL and the operation blocks. Will fail + * with -ENOSYS if not implemented. + */ +asmlinkage long sys_io_getevents(aio_context_t ctx_id, + long min_nr, + long nr, + struct io_event *events, + struct timespec *timeout) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (unlikely(min_nr > nr || min_nr < 0 || nr < 0)) + return ret; + + if (likely(NULL != ioctx)) { + ret = read_events(ioctx, min_nr, nr, events, timeout); + put_ioctx(ioctx); + } + + return ret; +} + +__initcall(aio_setup); + +EXPORT_SYMBOL(generic_file_kvec_read); +EXPORT_SYMBOL(generic_file_aio_read); +EXPORT_SYMBOL(generic_file_kvec_write); +EXPORT_SYMBOL(generic_file_aio_write); diff -urNp x-ref/fs/buffer.c x/fs/buffer.c --- x-ref/fs/buffer.c 2003-02-14 05:22:27.000000000 +0100 +++ x/fs/buffer.c 2003-02-14 05:22:30.000000000 +0100 @@ -3067,3 +3067,220 @@ static int __init bdflush_init(void) module_init(bdflush_init) +/* async kio interface */ +struct brw_cb { + kvec_cb_t cb; + atomic_t io_count; + int nr; + struct buffer_head *bh[1]; +}; + +static inline void brw_cb_put(struct brw_cb *brw_cb) +{ + if (atomic_dec_and_test(&brw_cb->io_count)) { + ssize_t res = 0, err = 0; + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (!err && buffer_uptodate(bh)) + res += bh->b_size; + else + err = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (!res) + res = err; + + brw_cb->cb.fn(brw_cb->cb.data, brw_cb->cb.vec, res); + + kfree(brw_cb); + } +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. 
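+ * brw_kvec_async() initialises io_count to nr+1 and drops the extra reference
+ * itself after submitting all the buffer heads, so the callback cannot fire
+ * while submission is still in progress.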
+ */ + +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) +{ + struct brw_cb *brw_cb; + + mark_buffer_uptodate(bh, uptodate); + + brw_cb = bh->b_private; + unlock_buffer(bh); + + brw_cb_put(brw_cb); +} + + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift) +{ + struct kvec *vec = cb.vec; + struct kveclet *veclet; + int err; + int length; + unsigned sector_size = 1 << sector_shift; + int i; + + struct brw_cb *brw_cb; + + if (!vec->nr) + BUG(); + + /* + * First, do some alignment and validity checks + */ + length = 0; + for (veclet=vec->veclet, i=0; i < vec->nr; i++,veclet++) { + length += veclet->length; + if ((veclet->offset & (sector_size-1)) || + (veclet->length & (sector_size-1))) { + printk("brw_kiovec_async: tuple[%d]->offset=0x%x length=0x%x sector_size: 0x%x\n", i, veclet->offset, veclet->length, sector_size); + return -EINVAL; + } + } + + if (length < (blocks << sector_shift)) + BUG(); + + /* + * OK to walk down the iovec doing page IO on each page we find. + */ + err = 0; + + if (!blocks) { + printk("brw_kiovec_async: !i\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb->cb = cb; + brw_cb->nr = 0; + + /* This is ugly. FIXME. */ + for (i=0, veclet=vec->veclet; inr; i++,veclet++) { + struct page *page = veclet->page; + unsigned offset = veclet->offset; + unsigned length = veclet->length; + + if (!page) + BUG(); + + while (length > 0) { + struct buffer_head *tmp; + tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO); + err = -ENOMEM; + if (!tmp) + goto error; + + tmp->b_dev = B_FREE; + tmp->b_size = sector_size; + set_bh_page(tmp, page, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blknr++; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) + | (1 << BH_Req); + tmp->b_private = brw_cb; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } + + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + + if (brw_cb->nr >= blocks) + goto submit; + } /* End of block loop */ + } /* End of page loop */ + +submit: + atomic_set(&brw_cb->io_count, brw_cb->nr+1); + /* okay, we've setup all our io requests, now fire them off! */ + for (i=0; inr; i++) + submit_bh(rw, brw_cb->bh[i]); + brw_cb_put(brw_cb); + + return 0; + +error: + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. 
*/ + for (i=0; inr; i++) + kmem_cache_free(bh_cachep, brw_cb->bh[i]); + kfree(brw_cb); + } + + return err; +} +#if 0 +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; ierrno) { + if (!err) + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; + } + +out: + return transferred ? transferred : err; +} +#endif diff -urNp x-ref/fs/exec.c x/fs/exec.c --- x-ref/fs/exec.c 2003-02-14 05:22:15.000000000 +0100 +++ x/fs/exec.c 2003-02-14 05:22:30.000000000 +0100 @@ -425,6 +425,7 @@ static int exec_mmap(void) old_mm = current->mm; if (old_mm && atomic_read(&old_mm->mm_users) == 1) { mm_release(); + exit_aio(old_mm); exit_mmap(old_mm); return 0; } diff -urNp x-ref/fs/ext2/file.c x/fs/ext2/file.c --- x-ref/fs/ext2/file.c 2002-01-22 18:54:59.000000000 +0100 +++ x/fs/ext2/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -40,6 +40,8 @@ static int ext2_release_file (struct ino */ struct file_operations ext2_file_operations = { llseek: generic_file_llseek, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, read: generic_file_read, write: generic_file_write, ioctl: ext2_ioctl, @@ -47,6 +49,8 @@ struct file_operations ext2_file_operati open: generic_file_open, release: ext2_release_file, fsync: ext2_sync_file, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext2_file_inode_operations = { diff -urNp x-ref/fs/ext3/file.c x/fs/ext3/file.c --- x-ref/fs/ext3/file.c 2002-11-29 02:23:15.000000000 +0100 +++ x/fs/ext3/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -111,6 +111,8 @@ force_commit: struct file_operations ext3_file_operations = { llseek: generic_file_llseek, /* BKL held */ + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, /* FIXME: attributes */ read: generic_file_read, /* BKL not held. Don't need */ write: ext3_file_write, /* BKL not held. Don't need */ ioctl: ext3_ioctl, /* BKL held */ @@ -118,6 +120,8 @@ struct file_operations ext3_file_operati open: ext3_open_file, /* BKL not held. Don't need */ release: ext3_release_file, /* BKL not held. 
Don't need */ fsync: ext3_sync_file, /* BKL held */ + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations ext3_file_inode_operations = { diff -urNp x-ref/fs/file_table.c x/fs/file_table.c --- x-ref/fs/file_table.c 2002-11-29 02:23:15.000000000 +0100 +++ x/fs/file_table.c 2003-02-14 05:22:30.000000000 +0100 @@ -97,33 +97,37 @@ int init_private_file(struct file *filp, return 0; } -void fput(struct file * file) +inline void __fput(struct file * file) { struct dentry * dentry = file->f_dentry; struct vfsmount * mnt = file->f_vfsmnt; struct inode * inode = dentry->d_inode; - if (atomic_dec_and_test(&file->f_count)) { - locks_remove_flock(file); + locks_remove_flock(file); - if (file->f_iobuf) - free_kiovec(1, &file->f_iobuf); + if (file->f_iobuf) + free_kiovec(1, &file->f_iobuf); - if (file->f_op && file->f_op->release) - file->f_op->release(inode, file); - fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); - file_list_lock(); - file->f_dentry = NULL; - file->f_vfsmnt = NULL; - list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; - file_list_unlock(); - dput(dentry); - mntput(mnt); - } + if (file->f_op && file->f_op->release) + file->f_op->release(inode, file); + fops_put(file->f_op); + if (file->f_mode & FMODE_WRITE) + put_write_access(inode); + file_list_lock(); + file->f_dentry = NULL; + file->f_vfsmnt = NULL; + list_del(&file->f_list); + list_add(&file->f_list, &free_list); + files_stat.nr_free_files++; + file_list_unlock(); + dput(dentry); + mntput(mnt); +} + +void fput(struct file * file) +{ + if (atomic_dec_and_test(&file->f_count)) + __fput(file); } struct file * fget(unsigned int fd) diff -urNp x-ref/fs/jfs/file.c x/fs/jfs/file.c --- x-ref/fs/jfs/file.c 2003-01-29 06:14:11.000000000 +0100 +++ x/fs/jfs/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -102,4 +102,8 @@ struct file_operations jfs_file_operatio .mmap = generic_file_mmap, .fsync = jfs_fsync, .release = jfs_release, + .kvec_read = generic_file_kvec_read, + .kvec_write = generic_file_kvec_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, }; diff -urNp x-ref/fs/nfs/file.c x/fs/nfs/file.c --- x-ref/fs/nfs/file.c 2003-02-14 05:22:16.000000000 +0100 +++ x/fs/nfs/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -40,9 +40,13 @@ static ssize_t nfs_file_read(struct file static ssize_t nfs_file_write(struct file *, const char *, size_t, loff_t *); static int nfs_file_flush(struct file *); static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos); struct file_operations nfs_file_operations = { llseek: generic_file_llseek, + kvec_read: nfs_kvec_read, + kvec_write: nfs_kvec_write, read: nfs_file_read, write: nfs_file_write, mmap: nfs_file_mmap, @@ -51,6 +55,8 @@ struct file_operations nfs_file_operatio release: nfs_release, fsync: nfs_fsync, lock: nfs_lock, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; struct inode_operations nfs_file_inode_operations = { @@ -89,6 +95,28 @@ nfs_file_flush(struct file *file) return status; } +static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return 
generic_file_kvec_write(file, cb, count, pos); + return ret; +} + +static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos) +{ + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; + int ret; + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!ret) + return generic_file_kvec_read(file, cb, count, pos); + return ret; +} + static ssize_t nfs_file_read(struct file * file, char * buf, size_t count, loff_t *ppos) { diff -urNp x-ref/fs/reiserfs/file.c x/fs/reiserfs/file.c --- x-ref/fs/reiserfs/file.c 2002-11-29 02:23:16.000000000 +0100 +++ x/fs/reiserfs/file.c 2003-02-14 05:22:30.000000000 +0100 @@ -136,6 +136,10 @@ struct file_operations reiserfs_file_ope mmap: generic_file_mmap, release: reiserfs_file_release, fsync: reiserfs_sync_file, + kvec_read: generic_file_kvec_read, + kvec_write: generic_file_kvec_write, + aio_read: generic_file_aio_read, + aio_write: generic_file_aio_write, }; diff -urNp x-ref/include/asm-i386/kmap_types.h x/include/asm-i386/kmap_types.h --- x-ref/include/asm-i386/kmap_types.h 2002-11-29 02:23:16.000000000 +0100 +++ x/include/asm-i386/kmap_types.h 2003-02-14 05:22:30.000000000 +0100 @@ -8,6 +8,8 @@ enum km_type { KM_USER0, KM_USER1, KM_BH_IRQ, + KM_IRQ0, + KM_IRQ1, KM_TYPE_NR }; diff -urNp x-ref/include/asm-i386/semaphore.h x/include/asm-i386/semaphore.h --- x-ref/include/asm-i386/semaphore.h 2002-11-29 02:23:16.000000000 +0100 +++ x/include/asm-i386/semaphore.h 2003-02-14 05:22:30.000000000 +0100 @@ -131,6 +131,31 @@ static inline void down(struct semaphore :"memory"); } +/* Returns 0 if we acquired the semaphore, 1 if it was queued. */ +struct worktodo; +static inline int wtd_down(struct worktodo *wtd, struct semaphore *sem) +{ + int ret = 0; +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + + __asm__ __volatile__( + "# atomic down operation\n\t" + LOCK "decl %0\n\t" /* --sem->count */ + "js 2f\n" + "1:\n" + LOCK_SECTION_START("") + "2:\tcall __wtd_down_failed\n\t" + "movl $1,%1\n\t" + "jmp 1b\n" + LOCK_SECTION_END + :"=m" (sem->count), "=r" (ret) + :"c" (sem), "1" (ret), "d" (wtd) + :"memory"); + return ret; +} + /* * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR diff -urNp x-ref/include/asm-ppc/kmap_types.h x/include/asm-ppc/kmap_types.h --- x-ref/include/asm-ppc/kmap_types.h 2002-11-29 02:23:17.000000000 +0100 +++ x/include/asm-ppc/kmap_types.h 2003-02-14 05:22:30.000000000 +0100 @@ -12,6 +12,8 @@ enum km_type { KM_USER0, KM_USER1, KM_BH_IRQ, + KM_IRQ0, + KM_IRQ1, KM_TYPE_NR }; diff -urNp x-ref/include/asm-x86_64/kmap_types.h x/include/asm-x86_64/kmap_types.h --- x-ref/include/asm-x86_64/kmap_types.h 2003-01-29 06:14:23.000000000 +0100 +++ x/include/asm-x86_64/kmap_types.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,13 +0,0 @@ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -enum km_type { - KM_BOUNCE_READ, - KM_SKB_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_TYPE_NR -}; - -#endif diff -urNp x-ref/include/asm-x86_64/semaphore.h x/include/asm-x86_64/semaphore.h --- x-ref/include/asm-x86_64/semaphore.h 2003-01-29 06:14:23.000000000 +0100 +++ x/include/asm-x86_64/semaphore.h 2003-02-14 05:22:30.000000000 +0100 @@ -133,6 +133,31 @@ static inline void down(struct semaphore :"memory"); } +/* Returns 0 if we acquired the semaphore, 1 if it was queued. 
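+ * If it was queued, __wtd_down() parks the worktodo on the semaphore's wait queue
+ * and queues it for execution once the semaphore is eventually acquired, so the
+ * caller must not assume it holds the semaphore on return.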
*/ +struct worktodo; +static inline int wtd_down(struct worktodo *wtd, struct semaphore *sem) +{ + int ret = 0; +#if WAITQUEUE_DEBUG + CHECK_MAGIC(sem->__magic); +#endif + + __asm__ __volatile__( + "# atomic down operation\n\t" + LOCK "decl %0\n\t" /* --sem->count */ + "js 2f\n" + "1:\n" + LOCK_SECTION_START("") + "2:\tcall __wtd_down_failed\n\t" + "movl $1,%1\n\t" + "jmp 1b\n" + LOCK_SECTION_END + :"=m" (sem->count), "=r" (ret) + :"D" (sem), "1" (ret), "S" (wtd) + :"memory"); + return ret; +} + /* * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR @@ -215,3 +240,4 @@ static inline void up(struct semaphore * } #endif /* __KERNEL__ */ #endif + diff -urNp x-ref/include/linux/aio.h x/include/linux/aio.h --- x-ref/include/linux/aio.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/aio.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,127 @@ +#ifndef __LINUX__AIO_H +#define __LINUX__AIO_H + +#include +#include +#include +#include + +#include + +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kioctx; + +/* Notes on cancelling a kiocb: + * If a kiocb is cancelled, aio_complete may return 0 to indicate + * that cancel has not yet disposed of the kiocb. All cancel + * operations *must* call aio_put_req to dispose of the kiocb + * to guard against races with the completion code. + */ +#define KIOCB_C_CANCELLED 0x01 +#define KIOCB_C_COMPLETE 0x02 + +struct kiocb { + struct list_head list; + struct file *filp; + struct kioctx *ctx; + void *user_obj; + __u64 user_data; + loff_t pos; + unsigned long buf; + size_t nr_transferred; /* used for chunking */ + size_t size; + size_t this_size; + unsigned key; /* id of this request */ + int (*cancel)(struct kiocb *, struct io_event *); + void *data; /* for use by the the async op */ + int users; + union { + struct tq_struct tq; /* argh. 
*/ + struct list_head list; + } u; +}; + +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned woke; /* set when a wakeup was sent */ + unsigned pad[3]; + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr) + +#define AIO_RING_PAGES 8 +struct aio_ring_info { + //struct file *mmap_file; + struct kvec *kvec; + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + spinlock_t ring_lock; + unsigned nr_pages; + + unsigned nr, tail; + + struct page *internal_pages[AIO_RING_PAGES]; +}; + +struct kioctx { + atomic_t users; + int dead; + struct mm_struct *mm; + + /* This needs improving */ + unsigned long user_id; + struct kioctx *next; + + wait_queue_head_t wait; + + spinlock_t lock; + + int reqs_active; + struct list_head free_reqs; + struct list_head active_reqs; /* used for cancellation */ + + unsigned max_reqs; + + struct aio_ring_info ring_info; +}; + +/* prototypes */ +extern unsigned aio_max_size; + +extern int FASTCALL(aio_put_req(struct kiocb *iocb)); +extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2)); +extern void FASTCALL(__put_ioctx(struct kioctx *ctx)); +struct mm_struct; +extern void FASTCALL(exit_aio(struct mm_struct *mm)); + +#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0) +#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0) + +#include + +static inline struct kiocb *list_kiocb(struct list_head *h) +{ + return list_entry(h, struct kiocb, list); +} + +struct file; +extern ssize_t generic_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size); +extern ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size); +extern ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +extern ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); + +/* for sysctl: */ +extern unsigned aio_nr, aio_max_nr, aio_max_size, aio_max_pinned; + +#endif /* __LINUX__AIO_H */ diff -urNp x-ref/include/linux/aio_abi.h x/include/linux/aio_abi.h --- x-ref/include/linux/aio_abi.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/aio_abi.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,91 @@ +/* linux/aio_abi.h + * + * Copyright 2000,2001,2002 Red Hat. + * + * Written by Benjamin LaHaise + * + * Distribute under the terms of the GPLv2 (see ../../COPYING) or under + * the following terms. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation is hereby granted, provided that the above copyright + * notice appears in all copies. This software is provided without any + * warranty, express or implied. Red Hat makes no representations about + * the suitability of this software for any purpose. + * + * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, + * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF + * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
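
The aio_ring_avail() macro above uses the usual one-slot-reserved ring convention: free slots are counted between tail and head modulo nr, and one event slot is always left unused so that head == tail can only mean "empty", never "full" (assuming, as in the rest of the AIO code, that completions are appended at tail and the consumer advances head). A minimal userspace sketch of the same arithmetic, with hypothetical values:

/* Sketch only: mirrors the aio_ring_avail() expression with made-up numbers. */
#include <stdio.h>

static unsigned ring_avail(unsigned head, unsigned tail, unsigned nr)
{
	/* same expression as the aio_ring_avail() macro */
	return (head + nr - 1 - tail) % nr;
}

int main(void)
{
	unsigned nr = 8;	/* hypothetical ring size */

	printf("empty : %u free\n", ring_avail(0, 0, nr));	/* 7 */
	printf("1 used: %u free\n", ring_avail(0, 1, nr));	/* 6 */
	printf("full  : %u free\n", ring_avail(3, 2, nr));	/* 0 */
	return 0;
}
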
+ * + * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND + * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, + * ENHANCEMENTS, OR MODIFICATIONS. + */ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +#include + +typedef unsigned long aio_context_t; + +enum { + IOCB_CMD_PREAD = 0, + IOCB_CMD_PWRITE = 1, + IOCB_CMD_FSYNC = 2, + IOCB_CMD_FDSYNC = 3, + /* These two are experimental. + * IOCB_CMD_PREADX = 4, + * IOCB_CMD_POLL = 5, + */ + IOCB_CMD_NOOP = 6, +}; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#if defined(__LITTLE_ENDIAN) +#define PADDED(x,y) x, y +#elif defined(__BIG_ENDIAN) +#define PADDED(x,y) y, x +#else +#error edit for your odd byteorder. +#endif + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + */ + +struct iocb { + /* these are internal to the kernel/libc. */ + __u64 aio_data; /* data to be returned in event's data */ + __u32 PADDED(aio_key, aio_reserved1); + /* the kernel sets aio_key to the req # */ + + /* common fields */ + __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ + __s16 aio_reqprio; + __u32 aio_fildes; + + __u64 aio_buf; + __u64 aio_nbytes; + __s64 aio_offset; + + /* extra parameters */ + __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */ + __u64 aio_reserved3; +}; /* 64 bytes */ + +#undef IFBIG +#undef IFLITTLE + +#endif /* __LINUX__AIO_ABI_H */ diff -urNp x-ref/include/linux/brlock.h x/include/linux/brlock.h --- x-ref/include/linux/brlock.h 2003-02-14 05:22:25.000000000 +0100 +++ x/include/linux/brlock.h 2003-02-14 05:22:30.000000000 +0100 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_AIO_REQ_LOCK, __BR_END }; diff -urNp x-ref/include/linux/file.h x/include/linux/file.h --- x-ref/include/linux/file.h 2002-08-09 14:52:29.000000000 +0200 +++ x/include/linux/file.h 2003-02-14 05:22:30.000000000 +0100 @@ -5,6 +5,7 @@ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H +extern void FASTCALL(__fput(struct file *)); extern void FASTCALL(fput(struct file *)); extern struct file * FASTCALL(fget(unsigned int fd)); diff -urNp x-ref/include/linux/fs.h x/include/linux/fs.h --- x-ref/include/linux/fs.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/fs.h 2003-02-14 05:22:30.000000000 +0100 @@ -198,6 +198,8 @@ extern int leases_enable, dir_notify_ena #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ #ifdef __KERNEL__ +#include +#include #include #include @@ -950,6 +952,15 @@ struct file_operations { ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + + /* in-kernel fully async api */ + int (*kvec_read)(struct file *, kvec_cb_t, size_t, loff_t); + int (*kvec_write)(struct file *, kvec_cb_t, size_t, loff_t); + + /* userland aio ops */ + ssize_t (*aio_read)(struct file *, struct kiocb *, struct iocb *); + ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb *); + ssize_t 
(*aio_fsync)(struct file *, struct kiocb *, struct iocb *); }; struct inode_operations { @@ -1604,6 +1615,8 @@ extern ssize_t generic_file_write_nolock extern void __do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t, int); #define do_generic_file_read(filp, ppos, desc, actor) __do_generic_file_read(filp, ppos, desc, actor, 0) #define do_generic_file_read_atomic(filp, ppos, desc, actor) __do_generic_file_read(filp, ppos, desc, actor, 1) +extern int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); +extern int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); diff -urNp x-ref/include/linux/highmem.h x/include/linux/highmem.h --- x-ref/include/linux/highmem.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/highmem.h 2003-02-14 05:22:30.000000000 +0100 @@ -82,6 +82,18 @@ static inline void *kmap(struct page *pa #define bh_kmap_irq(bh, flags) ((bh)->b_data) #define bh_kunmap_irq(bh, flags) do { *(flags) = 0; } while (0) +enum km_type { + KM_BOUNCE_READ, + KM_SKB_SUNRPC_DATA, + KM_SKB_DATA_SOFTIRQ, + KM_USER0, + KM_USER1, + KM_BH_IRQ, + KM_IRQ0, + KM_IRQ1, + KM_TYPE_NR +}; + #endif /* CONFIG_HIGHMEM */ /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ diff -urNp x-ref/include/linux/kiovec.h x/include/linux/kiovec.h --- x-ref/include/linux/kiovec.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/kiovec.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,155 @@ +#ifndef __LINUX__KIOVEC_H +#define __LINUX__KIOVEC_H + +struct page; +struct mm_struct; +#include + +struct kveclet { + struct page *page; + unsigned offset; + unsigned length; +}; + +struct kvec { + unsigned max_nr; + unsigned nr; + struct kveclet veclet[0]; +}; + +struct kvec_cb { + struct kvec *vec; + void (*fn)(void *data, struct kvec *vec, ssize_t res); + void *data; +}; + +struct kvec_cb_list { + struct list_head list; + struct kvec_cb cb; +}; + +#ifndef _LINUX_TYPES_H +#include +#endif +#ifndef _LINUX_KDEV_T_H +#include +#endif +#ifdef CONFIG_HIGHMEM +#include +#endif + +extern struct kvec *FASTCALL(map_user_kvec(int rw, unsigned long va, size_t len)); +extern struct kvec *FASTCALL(mm_map_user_kvec(struct mm_struct *, int rw, + unsigned long va, size_t len)); +extern void FASTCALL(unmap_kvec(struct kvec *, int dirtied)); +extern void FASTCALL(free_kvec(struct kvec *)); + +/* brw_kvec_async: + * Performs direct io to/from disk into cb.vec. Count is the number + * of sectors to read, sector_shift is the blocksize (which must be + * compatible with the kernel's current idea of the device's sector + * size) in log2. blknr is the starting sector offset on dev. + * + */ +extern int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned count, + unsigned long blknr, int sector_shift); + +/* Memory copy helpers usage: + * void foo(... struct kveclet *veclet...) + * + * struct kvec_dst dst; + * + * kvec_dst_init(&dst); -- resets type + * kvec_dst_set(&dst, veclet); -- set target & clear offset + * kvec_dst_map(&dst); -- activates kmap + * for (...) + * memcpy_to_kvec_dst(&dst, data, size); -- each copy appends + * kvec_dst_unmap(&dst); -- releases kmap + * + * Note that scheduling is not permitted between kvec_dst_map() and + * kvec_dst_unmap(). 
This is because internally the routines make use + * of an atomic kmap. + */ +struct kvec_dst { + char *addr; + char *dst; + struct kveclet *let; + int space; + int offset; +#ifdef CONFIG_HIGHMEM + enum km_type type; +#endif +}; + + +#define kvec_dst_set(Xdst, Xlet) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = (Xlet); \ + _dst->let = _let; \ + _dst->space = _let->length; \ + _dst->offset = 0; \ + } while(0) + +#ifdef CONFIG_HIGHMEM +#define kvec_dst_map(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = _dst->let; \ + _dst->dst = _dst->addr = kmap_atomic(_let->page, _dst->type);\ + _dst->dst += _let->offset + _dst->offset; \ + _dst->space = _let->length - _dst->offset; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_init(Xdst) \ + do { \ + (Xdst)->space = 0; \ + (Xdst)->addr = 0; \ + (Xdst)->offset = 0; \ + (Xdst)->type = KM_USER0; \ + } while(0) + +#define kvec_dst_unmap(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + kunmap_atomic(_dst->addr, _dst->type); \ + _dst->offset = _dst->dst - _dst->addr; \ + _dst->offset -= _dst->let->offset; \ + _dst->addr = NULL; \ + } while(0) +#else +#define kvec_dst_map(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + struct kveclet *_let = _dst->let; \ + _dst->dst = _dst->addr = page_address(_let->page); \ + _dst->dst += _let->offset + _dst->offset; \ + _dst->space = _let->length - _dst->offset; \ + _dst->offset = 0; \ + } while(0) + +#define kvec_dst_init(Xdst) \ + do { \ + (Xdst)->space = 0; \ + (Xdst)->addr = 0; \ + (Xdst)->offset = 0; \ + } while(0) + +#define kvec_dst_unmap(Xdst) \ + do { \ + struct kvec_dst *_dst = (Xdst); \ + _dst->offset = _dst->dst - _dst->addr; \ + _dst->offset -= _dst->let->offset; \ + _dst->addr = NULL; \ + } while(0) +#endif + +extern void FASTCALL(memcpy_to_kvec_dst(struct kvec_dst *dst, + const char *from, long len)); +extern void FASTCALL(memcpy_from_kvec_dst(char *to, + struct kvec_dst *from, long len)); +extern int FASTCALL(copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len)); + + +#endif /* __LINUX__KIOVEC_H */ diff -urNp x-ref/include/linux/list.h x/include/linux/list.h --- x-ref/include/linux/list.h 2003-02-14 05:22:15.000000000 +0100 +++ x/include/linux/list.h 2003-02-14 05:22:30.000000000 +0100 @@ -224,6 +224,8 @@ static inline void list_splice_init(list pos = list_entry(pos->member.next, typeof(*pos), member), \ prefetch(pos->member.next)) +#define list_first(head) (((head)->next != (head)) ? (head)->next: (struct list_head *) 0) + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ #endif diff -urNp x-ref/include/linux/mm.h x/include/linux/mm.h --- x-ref/include/linux/mm.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/mm.h 2003-02-14 05:22:30.000000000 +0100 @@ -712,9 +712,9 @@ static inline int expand_stack(struct vm } /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); -extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, - struct vm_area_struct **pprev); +extern struct vm_area_struct * FASTCALL(find_vma(struct mm_struct * mm, unsigned long addr)); +extern struct vm_area_struct * FASTCALL(find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev)); /* Look up the first VMA which intersects the interval start_addr..end_addr-1, NULL if none. Assume start_addr < end_addr. 
*/ diff -urNp x-ref/include/linux/pagemap.h x/include/linux/pagemap.h --- x-ref/include/linux/pagemap.h 2003-02-14 05:22:13.000000000 +0100 +++ x/include/linux/pagemap.h 2003-02-14 05:22:30.000000000 +0100 @@ -88,6 +88,7 @@ extern struct page *find_trylock_page(st extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index); extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index); extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); +extern wait_queue_head_t *FASTCALL(page_waitqueue(struct page *page)); extern void ___wait_on_page(struct page *); diff -urNp x-ref/include/linux/sched.h x/include/linux/sched.h --- x-ref/include/linux/sched.h 2003-02-14 05:22:26.000000000 +0100 +++ x/include/linux/sched.h 2003-02-14 05:22:30.000000000 +0100 @@ -223,6 +223,7 @@ struct files_struct { extern int max_map_count; +struct kioctx; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ rb_root_t mm_rb; @@ -251,6 +252,8 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + + struct kioctx *ioctx_list; }; extern int mmlist_nr; @@ -832,6 +835,7 @@ extern int do_fork(unsigned long, unsign extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void wait_task_inactive(task_t * p); diff -urNp x-ref/include/linux/sysctl.h x/include/linux/sysctl.h --- x-ref/include/linux/sysctl.h 2003-02-14 05:22:25.000000000 +0100 +++ x/include/linux/sysctl.h 2003-02-14 05:22:30.000000000 +0100 @@ -599,8 +599,7 @@ enum { /* CTL_PROC names: */ /* CTL_FS names: */ -enum -{ +enum { FS_NRINODE=1, /* int:current number of allocated inodes */ FS_STATINODE=2, FS_MAXINODE=3, /* int:maximum number of inodes that can be allocated */ @@ -618,6 +617,10 @@ enum FS_LEASE_TIME=15, /* int: maximum time to wait for a lease break */ FS_DQSTATS=16, /* dir: disc quota usage statistics */ FS_XFS=17, /* struct: control xfs parameters */ + FS_AIO_NR=18, /* int: current number of aio requests */ + FS_AIO_MAX_NR=19, /* int: max system wide aio requests */ + FS_AIO_MAX_SIZE=20, /* int: max size of read/write chunks */ + FS_AIO_MAX_PINNED=21, /* long: max memory pinned (in pages) */ }; /* /proc/sys/fs/quota/ */ diff -urNp x-ref/include/linux/tqueue.h x/include/linux/tqueue.h --- x-ref/include/linux/tqueue.h 2002-12-18 23:58:19.000000000 +0100 +++ x/include/linux/tqueue.h 2003-02-14 05:22:30.000000000 +0100 @@ -68,6 +68,9 @@ typedef struct list_head task_queue; extern task_queue tq_timer, tq_immediate, tq_disk; +/* same as run_task_queue(&tq_disk) but async, from wtd.c */ +extern void async_run_tq_disk(void); + /* * To implement your own list of active bottom halfs, use the following * two definitions: diff -urNp x-ref/include/linux/types.h x/include/linux/types.h --- x-ref/include/linux/types.h 2002-12-18 23:51:41.000000000 +0100 +++ x/include/linux/types.h 2003-02-14 05:22:30.000000000 +0100 @@ -127,4 +127,9 @@ struct ustat { char f_fpack[6]; }; +/* kernel typedefs -- they belong here. 
*/ +#ifdef __KERNEL__ +typedef struct kvec_cb kvec_cb_t; +#endif /* __KERNEL__ */ + #endif /* _LINUX_TYPES_H */ diff -urNp x-ref/include/linux/wait.h x/include/linux/wait.h --- x-ref/include/linux/wait.h 2003-02-14 05:22:11.000000000 +0100 +++ x/include/linux/wait.h 2003-02-14 05:22:30.000000000 +0100 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. Can be switched between spinlock_t and @@ -139,6 +142,7 @@ typedef struct __wait_queue_head wait_qu #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -176,6 +180,22 @@ static inline void init_waitqueue_entry( #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q || !p) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -233,6 +253,38 @@ static inline void __remove_wait_queue(w list_del(&old->task_list); } +#define add_wait_queue_cond(q, wait, cond) \ + ({ \ + unsigned long flags; \ + int _raced = 0; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + __add_wait_queue((q), (wait)); \ + mb(); \ + if (!(cond)) { \ + _raced = 1; \ + __remove_wait_queue((q), (wait)); \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + _raced; \ + }) + +#define add_wait_queue_exclusive_cond(q, wait, cond) \ + ({ \ + unsigned long flags; \ + int _raced = 0; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = WQ_FLAG_EXCLUSIVE; \ + __add_wait_queue_tail((q), (wait)); \ + mb(); \ + if (!(cond)) { \ + _raced = 1; \ + __remove_wait_queue((q), (wait)); \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + _raced; \ + }) + #endif /* __KERNEL__ */ #endif diff -urNp x-ref/include/linux/worktodo.h x/include/linux/worktodo.h --- x-ref/include/linux/worktodo.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/worktodo.h 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,76 @@ +/* + * Written by Benjamin LaHaise. + * + * Copyright 2000-2001 Red Hat, Inc. + * + * #include "gpl.h" + * + * Basic design idea from Jeff Merkey. + * Stack based on ideas from Ingo Molnar. 
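
add_wait_queue_cond() and add_wait_queue_exclusive_cond() above close the check-then-sleep race by queueing the waiter first, re-testing the condition while the waitqueue lock is still held, and backing the entry out again (reporting a race) if the reason to sleep has already gone away. A minimal userspace sketch of that pattern, using a pthread mutex in place of the waitqueue lock; the names are illustrative only, not part of the patch:

#include <pthread.h>
#include <stdio.h>

struct waiter { struct waiter *next; };

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct waiter *queue_head;

/* Returns 1 if we raced (nothing left to wait for, entry backed out),
 * 0 if the waiter stays queued and will be woken later. */
static int add_waiter_cond(struct waiter *w, int still_must_wait)
{
	int raced = 0;

	pthread_mutex_lock(&queue_lock);
	w->next = queue_head;
	queue_head = w;
	/* re-test the wait condition while the lock is held, as the
	 * macros do with mb() + (cond) */
	if (!still_must_wait) {
		queue_head = w->next;	/* back the waiter out again */
		raced = 1;
	}
	pthread_mutex_unlock(&queue_lock);
	return raced;
}

int main(void)
{
	struct waiter w;

	printf("still busy   -> raced=%d (waiter left queued)\n",
	       add_waiter_cond(&w, 1));
	queue_head = NULL;	/* pretend a wakeup removed it */
	printf("already free -> raced=%d (caller must not sleep)\n",
	       add_waiter_cond(&w, 0));
	return 0;
}
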
+ */ +#ifndef __LINUX__WORKTODO_H +#define __LINUX__WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include +#endif +#ifndef _LINUX_TQUEUE_H +#include +#endif + +struct wtd_stack { + void (*fn)(void *data); + void *data; +}; + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primatives */ + + int sp; + struct wtd_stack stack[3]; +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_init(wtd, routine) do { \ + INIT_TQUEUE(&(wtd)->tq, (routine), (wtd)); \ + (wtd)->data = 0; \ + (wtd)->sp = 0; \ +} while (0) + +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_push(wtd, action, wtddata) \ +do { \ + (wtd)->stack[(wtd)->sp].fn = (wtd)->tq.routine; \ + (wtd)->stack[(wtd)->sp++].data = (wtd)->tq.data;\ + (wtd)->tq.routine = action; \ + (wtd)->tq.data = wtddata; \ +} while (0) + +static inline void wtd_pop(struct worktodo *wtd) +{ + if (wtd->sp) { + wtd->sp--; + wtd->tq.routine = wtd->stack[wtd->sp].fn; + wtd->tq.data = wtd->stack[wtd->sp].data; + } +} + +#define wtd_set_action(wtd, action, wtddata) INIT_TQUEUE(&(wtd)->tq, action, wtddata) + +struct page; +struct buffer_head; +struct semaphore; +extern int wtd_lock_page(struct worktodo *wtd, struct page *page); +extern int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); + +#if 0 /* not implemented yet */ +extern int wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* __LINUX__WORKTODO_H */ diff -urNp x-ref/kernel/fork.c x/kernel/fork.c --- x-ref/kernel/fork.c 2003-02-14 05:22:21.000000000 +0100 +++ x/kernel/fork.c 2003-02-14 05:22:30.000000000 +0100 @@ -54,6 +54,16 @@ void add_wait_queue(wait_queue_head_t *q wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -278,6 +288,7 @@ int mmlist_nr; static struct mm_struct * mm_init(struct mm_struct * mm) { + mm->ioctx_list = NULL; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); @@ -313,6 +324,7 @@ struct mm_struct * mm_alloc(void) */ inline void __mmdrop(struct mm_struct *mm) { + BUG_ON(mm->ioctx_list); BUG_ON(mm == &init_mm); pgd_free(mm->pgd); check_pgt_cache(); @@ -332,6 +344,7 @@ void mmput(struct mm_struct *mm) list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); + exit_aio(mm); exit_mmap(mm); mmdrop(mm); } diff -urNp x-ref/kernel/ksyms.c x/kernel/ksyms.c --- x-ref/kernel/ksyms.c 2003-02-14 05:22:26.000000000 +0100 +++ x/kernel/ksyms.c 2003-02-14 05:22:30.000000000 +0100 @@ -446,6 +446,13 @@ EXPORT_SYMBOL(unlock_kiovec); EXPORT_SYMBOL(brw_kiovec); EXPORT_SYMBOL(kiobuf_wait_for_io); +/* kvecs */ +EXPORT_SYMBOL(map_user_kvec); +EXPORT_SYMBOL(unmap_kvec); +EXPORT_SYMBOL(free_kvec); +EXPORT_SYMBOL(memcpy_to_kvec_dst); +EXPORT_SYMBOL(memcpy_from_kvec_dst); + /* dma handling */ EXPORT_SYMBOL(request_dma); EXPORT_SYMBOL(free_dma); diff -urNp x-ref/kernel/sched.c x/kernel/sched.c --- x-ref/kernel/sched.c 2003-02-14 05:22:26.000000000 +0100 +++ x/kernel/sched.c 2003-02-14 05:22:30.000000000 +0100 @@ -976,13 +976,22 @@ void force_cpu_reschedule(int 
cpu) */ static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync) { - struct list_head *tmp; + struct list_head *tmp, *next; unsigned int state; wait_queue_t *curr; task_t *p; + wait_queue_func_t func; - list_for_each(tmp, &q->task_list) { + list_for_each_safe(tmp, next, &q->task_list) { curr = list_entry(tmp, wait_queue_t, task_list); + func = curr->func; + if (func) { + unsigned int flags = curr->flags; + func(curr); + if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if ((state & mode) && try_to_wake_up(p, sync) && diff -urNp x-ref/kernel/sysctl.c x/kernel/sysctl.c --- x-ref/kernel/sysctl.c 2003-02-14 05:22:26.000000000 +0100 +++ x/kernel/sysctl.c 2003-02-14 05:22:30.000000000 +0100 @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -305,6 +306,8 @@ static ctl_table proc_table[] = { {0} }; +extern int user_pinned_pages; + static ctl_table fs_table[] = { {FS_NRINODE, "inode-nr", &inodes_stat, 2*sizeof(int), 0444, NULL, &proc_dointvec}, @@ -328,6 +331,16 @@ static ctl_table fs_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {FS_AIO_NR, "aio-nr", &aio_nr, sizeof(aio_nr), + 0444, NULL, &proc_dointvec}, + {FS_AIO_MAX_NR, "aio-max-nr", &aio_max_nr, sizeof(aio_max_nr), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_SIZE, "aio-max-size", &aio_max_size, sizeof(aio_max_size), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_PINNED, "aio-max-pinned", &aio_max_pinned, sizeof(aio_max_pinned), + 0644, NULL, &proc_dointvec}, + {FS_AIO_MAX_PINNED+1, "aio-pinned", &user_pinned_pages, 4, + 0644, NULL, &proc_dointvec}, {0} }; diff -urNp x-ref/mm/Makefile x/mm/Makefile --- x-ref/mm/Makefile 2003-02-14 05:22:17.000000000 +0100 +++ x/mm/Makefile 2003-02-14 05:22:43.000000000 +0100 @@ -18,5 +18,6 @@ obj-y := memory.o mmap.o filemap.o mpro obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PROC_MM) += proc_mm.o +obj-y += wtd.o include $(TOPDIR)/Rules.make diff -urNp x-ref/mm/filemap.c x/mm/filemap.c --- x-ref/mm/filemap.c 2003-02-14 05:22:27.000000000 +0100 +++ x/mm/filemap.c 2003-02-14 05:22:30.000000000 +0100 @@ -29,6 +29,8 @@ #include #include +#include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -770,7 +772,7 @@ static inline wait_queue_head_t * wait_t * at a cost of "thundering herd" phenomena during rare hash * collisions. 
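
The __wake_up_common() change above is what lets AIO hook a wait queue without a sleeping task: entries whose func pointer is set get the callback invoked directly instead of try_to_wake_up(), and exclusive entries still count against nr_exclusive. A minimal userspace sketch of that dispatch, with illustrative types and names:

#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waitq_entry {
	unsigned int flags;
	const char *task;			/* stand-in for a task pointer */
	void (*func)(struct waitq_entry *);	/* NULL for ordinary waiters */
};

static void aio_style_callback(struct waitq_entry *e)
{
	printf("callback waiter %p run (work queued instead of a task woken)\n",
	       (void *)e);
}

static void wake_up_list(struct waitq_entry *list, int n, int nr_exclusive)
{
	int i;

	for (i = 0; i < n; i++) {
		struct waitq_entry *curr = &list[i];

		if (curr->func) {
			unsigned int flags = curr->flags;
			curr->func(curr);
			if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
				break;
			continue;
		}
		printf("waking task %s\n", curr->task);
		if ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}

int main(void)
{
	struct waitq_entry q[] = {
		{ 0, "reader", NULL },
		{ WQ_FLAG_EXCLUSIVE, NULL, aio_style_callback },
		{ WQ_FLAG_EXCLUSIVE, "writer", NULL },	/* not reached: nr_exclusive hit */
	};

	wake_up_list(q, 3, 1);
	return 0;
}
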
*/ -static inline wait_queue_head_t * page_waitqueue(struct page *page) +inline wait_queue_head_t * page_waitqueue(struct page *page) { pg_data_t * pgdat = page_zone(page)->zone_pgdat; return wait_table_hashfn(page, &pgdat->wait_table); @@ -1309,10 +1311,17 @@ void __do_generic_file_read(struct file int reada_ok; int error; int max_readahead = get_max_readahead(inode); + loff_t pos; + + pos = *ppos; + if (unlikely(pos < 0)) { + desc->error = -EINVAL; + return; + } cached_page = NULL; - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; /* * If the current position is outside the previous read-ahead window, @@ -1360,13 +1369,17 @@ void __do_generic_file_read(struct file end_index = i_size >> PAGE_CACHE_SHIFT; - if (index > end_index) + if (index > end_index) { + desc->error = 0; break; + } nr = PAGE_CACHE_SIZE; if (index == end_index) { nr = i_size & ~PAGE_CACHE_MASK; - if (nr <= offset) + if (nr <= offset) { + desc->error = 0; break; + } } nr = nr - offset; @@ -3209,3 +3222,698 @@ void __init page_cache_init(unsigned lon panic("Failed to allocate page hash table\n"); memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); } + +/* address_space_map + * Maps a series of pages from the page cache into the given array. + */ +static int address_space_map(struct address_space *as, unsigned long index, + int nr, struct page **pages, + int *nr_newp, struct page **new_pages) +{ + struct page *cached_page = NULL; + int nr_new = 0; + int ret; + + if (unlikely(nr <= 0)) { + *nr_newp = nr_new; + return 0; + } + + ret = 0; + + spin_lock(&pagecache_lock); + + while (nr > 0) { + struct page **hash = page_hash(as, index); + struct page *page; + + page = __find_page_nolock(as, index, *hash); + if (page) { + page_cache_get(page); +got_page: + pages[ret++] = page; + index++; + nr--; + continue; + } + + if (cached_page) { + __add_to_page_cache(cached_page, as, index, hash); + + spin_unlock(&pagecache_lock); + lru_cache_add(cached_page); + spin_lock(&pagecache_lock); + + nr_new++; + *new_pages++ = page = cached_page; + cached_page = NULL; + goto got_page; + } + spin_unlock(&pagecache_lock); + + cached_page = page_cache_alloc(as); + if (!cached_page) + goto out; + + /* Okay, we now have an allocated page. Retry + * the search and add. */ + spin_lock(&pagecache_lock); + } + + spin_unlock(&pagecache_lock); + +out: + if (cached_page) + page_cache_release(cached_page); + + *nr_newp = nr_new; + return ret ? 
ret : -ENOMEM; +} + +struct iodesc { + struct worktodo wtd; + + struct page *good_page; /* the highest Uptodate page */ + int good_idx; + int err; + int did_read; + int rw; + loff_t pos; + + struct page **pages; + struct page **new_pages; + struct page **cur_pagep; + int nr_pages; + int nr_new_pages; + + struct address_space *as; + struct file *file; + kvec_cb_t cb; + + size_t size; + unsigned long transferred; + unsigned offset; + struct kveclet *veclet; + + struct kvec_dst src; + + int sync; + unsigned long rlimit_fsize; + +#define READDESC_NR_DEF 3 + struct page *def_pages[READDESC_NR_DEF]; + struct page *def_new_pages[READDESC_NR_DEF]; +}; + +static void __iodesc_free(struct iodesc *io, int unlock) +{ + kvec_cb_t cb; + ssize_t res; + + if (unlock) { + unsigned i; + for (i=0; inr_pages; i++) { + struct page *page = io->pages[i]; + UnlockPage(page); + page_cache_release(page); + } + } else { + unsigned i; + for (i=0; inr_pages; i++) + page_cache_release(io->pages[i]); + } + + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); + if (io->pages != io->def_pages) + kfree(io->pages); + + cb = io->cb; + res = io->transferred ? io->transferred : io->err; + kfree(io); + + cb.fn(cb.data, cb.vec, res); +} + +/* By the time this function is called, all of the pages prior to + * the current good_idx have been released appropriately. The remaining + * duties are to release any remaining pages and to honour O_SYNC. + */ +static void __iodesc_finish_write(struct iodesc *io) +{ + pr_debug("__iodesc_finish_write(%p)\n", io); + + __iodesc_free(io, WRITE == io->rw); +} + +/* This is mostly ripped from generic_file_write */ +static int __iodesc_write_page(struct iodesc *io, struct page *page) +{ + char *kaddr = kmap(page); + unsigned long bytes; + unsigned long offset; + long status; + int done = 0; + + offset = io->offset; + kaddr += offset; + + bytes = PAGE_CACHE_SIZE - offset; + if (io->size < bytes) + bytes = io->size; + + pr_debug("__iodesc_write_page(%p (%lu), %lu %lu)\n", page, page->index, offset, bytes); + + io->err = io->as->a_ops->prepare_write(io->file, page, + offset, offset + bytes); + if (unlikely(io->err)) { + pr_debug("prepare_write: %d\n", io->err); + kunmap(page); + return 1; + } + + kvec_dst_map(&io->src); + memcpy_from_kvec_dst(kaddr, &io->src, bytes); + kvec_dst_unmap(&io->src); /* commit_write may block */ + + flush_dcache_page(page); + status = io->as->a_ops->commit_write(io->file, page, + offset, offset+bytes); + + /* We don't handle short writes */ + if (status > 0 && status != bytes) + done = 1; + + if (!status) + status = bytes; + + if (likely(status > 0)) { + io->transferred += status; + io->size -= status; + io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1); + + if (io->offset) + done = 1; + } else { + io->err = status; + done = 1; + } + + kunmap(page); + return done; +} + +void __iodesc_sync_wait_page(void *data) +{ + struct iodesc *io = data; + + do { + struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers; + + if (!head) + continue; + + bh = head; + do { + if (buffer_locked(bh)) { + pr_debug("waiting on bh=%pi io=%p\n", bh, io); + if (!wtd_wait_on_buffer(&io->wtd, bh)) + return; + } + if (buffer_req(bh) && !buffer_uptodate(bh)) { + pr_debug("io err bh=%p (%p)\n", bh, io); + io->err = -EIO; + break; + } + } while ((bh = bh->b_this_page) != head); + } while (!io->err && ++io->good_idx < io->nr_pages) ; + + pr_debug("finish_write(%p)\n", io); + __iodesc_finish_write(io); +} + +static void __iodesc_do_write(void *data) +{ + struct iodesc *io = data; 
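+	/*
+	 * Copy and commit each mapped page in turn via __iodesc_write_page()
+	 * (prepare_write, copy from the source kvec, commit_write) until the
+	 * request is exhausted or an error occurs, then drop the i_sem taken
+	 * in generic_file_rw_kvec().  O_SYNC writes push the pages to disk
+	 * and defer completion to __iodesc_sync_wait_page(); everything else
+	 * completes immediately via __iodesc_finish_write().
+	 */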
+ unsigned i; + + for (i=0; inr_pages; i++) { + if (__iodesc_write_page(io, io->pages[i])) + break; + } + + up(&io->file->f_dentry->d_inode->i_sem); + + if (io->sync) { + io->good_idx = 0; + + pr_debug("writing out pages(%p)\n", io); + for (i=0; inr_pages; i++) { + if (io->pages[i]->buffers) + writeout_one_page(io->pages[i]); + } + + pr_debug("calling __iodesc_sync_wait_page(%p)\n", io); + wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io); + __iodesc_sync_wait_page(io); + return; + } + + __iodesc_finish_write(io); +} + +static void __iodesc_write_lock_next_page(void *data) +{ + struct iodesc *io = data; + pr_debug("__iodesc_write_next_page(%p)\n", io); + + while (io->good_idx < io->nr_pages) { + io->good_page = io->pages[io->good_idx++]; + if (io->good_page == *io->cur_pagep) + io->cur_pagep++; + else { + if (!wtd_lock_page(&io->wtd, io->good_page)) + return; + } + } + + //Is this faster? __iodesc_do_write(io); + wtd_set_action(&io->wtd, __iodesc_do_write, io); + wtd_queue(&io->wtd); +} + +static void __generic_file_write_iodesc(struct iodesc *io) +{ + struct inode *inode = io->file->f_dentry->d_inode; + time_t now = CURRENT_TIME; + + remove_suid(inode); + if (inode->i_ctime != now || inode->i_mtime != now) { + inode->i_ctime = inode->i_mtime = now; + mark_inode_dirty_sync(inode); + } + + wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io); + io->sync = !!(io->file->f_flags & O_SYNC); + io->good_idx = 0; + io->cur_pagep = io->new_pages; + __iodesc_write_lock_next_page(io); +} + +static void __iodesc_read_finish(struct iodesc *io) +{ + struct page **src_pagep; + char *dst_addr, *src_addr; + int src_off; + size_t size; + size_t valid; + + struct kveclet *veclet = io->veclet; + struct page *dst_page = veclet->page; + int dst_len = veclet->length; + int dst_off = veclet->offset; + + + pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx); + if (io->good_idx <= 0) + goto no_data; + + size = io->size; + src_off = io->offset; + src_pagep = io->pages; + src_addr = kmap(*src_pagep); + + valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT; + valid -= src_off; + pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off); + + if (valid < size) + size = valid; + + dst_addr = kmap(veclet->page); + + while (size > 0) { + int this = PAGE_CACHE_SIZE - src_off; + if ((PAGE_SIZE - dst_off) < this) + this = PAGE_SIZE - dst_off; + if (size < this) + this = size; + pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n", + this, src_off, dst_off, dst_len); + memcpy(dst_addr + dst_off, src_addr + src_off, this); + + src_off += this; + dst_off += this; + dst_len -= this; + size -= this; + io->transferred += this; + pr_debug("read_finish: this=%d transferred=%d\n", + this, io->transferred); + + if (size <= 0) + break; + + if (dst_len <= 0) { + kunmap(dst_page); + veclet++; + dst_page = veclet->page; + dst_off = veclet->offset; + dst_len = veclet->length; + dst_addr = kmap(dst_page); + } + + if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */ + kunmap(*src_pagep); + pr_debug("page(%lu)->count = %d\n", + (*src_pagep)->index, + atomic_read(&(*src_pagep)->count)); + src_pagep++; + src_addr = kmap(*src_pagep); + src_off = 0; + } + } + kunmap(dst_page); + kunmap(*src_pagep); +no_data: + __iodesc_free(io, 0); +} + +static void __iodesc_make_uptodate(void *data) +{ + struct iodesc *io = data; + struct page *page = io->good_page; + int locked = 1; + + pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index); +again: + while (Page_Uptodate(page)) { + pr_debug("page index %lu 
uptodate\n", page->index); + if (locked) { + UnlockPage(page); + locked = 0; + } + io->did_read = 0; + io->good_idx++; + if (io->good_idx >= io->nr_pages) { + __iodesc_read_finish(io); + return; + } + page = io->good_page = io->pages[io->good_idx]; + pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index); + } + + if (!locked) { + if (!wtd_lock_page(&io->wtd, page)) + return; + locked = 1; + } + + if (!io->did_read) { + /* We haven't tried reading this page before, give it a go. */ + pr_debug("attempting to read %lu\n", page->index); + io->did_read = 1; + if (likely(page->mapping)) { + locked = 0; + io->err = page->mapping->a_ops->readpage(io->file, page); + if (!io->err) { + if (Page_Uptodate(page)) + goto again; + if (wtd_lock_page(&io->wtd, page)) { + locked = 1; + goto again; + } + return; + } + } else + /* page not mapped, erroring out. */ + io->err = 0; + } + + if (locked) + UnlockPage(page); + + /* We've already read this page before. Set err to EIO and quite */ + if (!io->err) + io->err = -EIO; + __iodesc_read_finish(io); +} + +static void __wtdgeneric_file_read_iodesc(void *data); + +static void __generic_file_read_iodesc(struct iodesc *io, int mayblock) +{ + int (*readpage)(struct file *, struct page *); + int i; + + wtd_set_action(&io->wtd, __iodesc_make_uptodate, io); + readpage = io->as->a_ops->readpage; + for (i=0; inr_new_pages; i++) { + int ret; + if (!mayblock) { + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + return; + } + ret = readpage(io->file, io->new_pages[i]); + if (ret) + printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, ret); + } + + for (i=0; inr_pages; i++) { + struct page *page = io->pages[i]; + if (Page_Uptodate(page)) { + pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index); + continue; + } + + if (!mayblock) { + wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io); + wtd_queue(&io->wtd); + return; + } + /* + * Lock the page and if it is still mapped in the file, + * attempt to read it in. + */ + if (!TryLockPage(page)) { + if (likely(page->mapping)) { + int ret = readpage(io->file, page); + if (ret) + pr_debug("__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret); + } else + /* page not mapped, truncated! */ + unlock_page(page); + } + + if (!Page_Uptodate(page) && io->good_idx == -1) { + pr_debug("first good_idx=%d (%lu)\n", i, page->index); + io->good_idx = i; + io->good_page = page; + } + } + + /* Whee, all the pages are uptodate! 
*/ + if (!io->good_page) { + pr_debug("all pages uptodate!\n"); + io->good_idx = io->nr_pages; + __iodesc_read_finish(io); + return; + } + + pr_debug("locking good_page\n"); + if (wtd_lock_page(&io->wtd, io->good_page)) + __iodesc_make_uptodate(io); + return; +} + +static void __wtdgeneric_file_read_iodesc(void *data) +{ + struct iodesc *io = data; + __generic_file_read_iodesc(io, 1); +} + +static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos); + +int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, READ, cb, size, pos); +} + +int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos) +{ + return generic_file_rw_kvec(file, WRITE, cb, size, pos); +} + +void wtd_rw_kvec_core(void *); +int rw_kvec_core(struct iodesc *io); + +int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb, + size_t size, loff_t pos) +{ + struct inode *inode = file->f_dentry->d_inode; + int append = file->f_flags & O_APPEND; + struct iodesc *io = NULL; + int ret; + + ret = -EINVAL; + if (unlikely(rw != READ && rw != WRITE)) + goto out; + + /* Don't check pos when appending, but otherwise do santity + * checks before allocating memory. -'ve offsets are invalid. + */ + if (unlikely(!append && pos < 0)) + goto out; + + ret = -ENOMEM; + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + goto out; + + memset(io, 0, sizeof(*io)); + io->file = file; + io->rw = rw; + io->cb = cb; + io->size = size; + io->pos = pos; + io->rlimit_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur; + wtd_set_action(&io->wtd, wtd_rw_kvec_core, io); + + if ((rw == READ) || (0 == wtd_down(&io->wtd, &inode->i_sem))) + return rw_kvec_core(io); + + return 0; + +out: + if (!ret) + cb.fn(cb.data, cb.vec, ret); + return ret; +} + +void wtd_rw_kvec_core(void *data) +{ + struct iodesc *io = data; + kvec_cb_t cb = io->cb; + int ret = rw_kvec_core(io); + if (ret) + cb.fn(cb.data, cb.vec, ret); +} + +int rw_kvec_core(struct iodesc *io) +{ + int append = io->file->f_flags & O_APPEND; + struct inode *inode = io->file->f_dentry->d_inode; + struct address_space *as = inode->i_mapping; + unsigned long index; + unsigned long eindex; + unsigned long nr_pages; + int ret; + + if (io->rw == WRITE) { + unsigned long long tmp; + loff_t limit; + + /* We've already down'd the inode semaphore */ + if (append) + io->pos = inode->i_size; + + limit = io->rlimit_fsize; + if (likely(RLIM_INFINITY == limit)) + limit = OFFSET_MAX; + + /* Filesystem limits take precedence over user limits */ + if (likely(inode->i_sb->s_maxbytes < limit)) + limit = inode->i_sb->s_maxbytes; + + if (unlikely(io->pos >= limit)) { + pr_debug("maxbytes: %Ld\n", limit); + ret = 0; + if (io->size || io->pos > limit) + ret = -EFBIG; + goto out_io; + } + + /* Clamp writes straddling limit. 
*/ + tmp = io->pos + io->size; + if (unlikely(tmp > (unsigned long long)limit)) + io->size = limit - io->pos; + } + + if (READ == io->rw) { + pr_debug("io->pos=%Ld i_size=%Ld\n", io->pos, inode->i_size); + + if (io->pos > inode->i_size) + io->size = 0; + else if ((io->pos + io->size) > inode->i_size) { + size_t size = inode->i_size - io->pos; + if (size < io->size) + io->size = size; + } + + pr_debug("io->size=%d\n", io->size); + } + + ret = 0; + if (unlikely(!io->size)) + goto out_io; + + index = io->pos >> PAGE_CACHE_SHIFT; + eindex = (io->pos + io->size - 1) >> PAGE_CACHE_SHIFT; + nr_pages = eindex - index + 1; + + pr_debug("nr_pages: %lu\n", nr_pages); + + io->good_idx = -1; + io->good_page = NULL; + io->did_read = 0; + io->err = 0; + io->as = as; + io->offset = (unsigned long)io->pos & (PAGE_CACHE_SIZE - 1); + kvec_dst_init(&io->src); + kvec_dst_set(&io->src, io->cb.vec->veclet); + io->veclet = io->cb.vec->veclet; + if (nr_pages < READDESC_NR_DEF) { + io->pages = io->def_pages; + io->new_pages = io->def_new_pages; + } else { + io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->pages) + goto out_io; + + io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL); + if (!io->new_pages) + goto out_pages; + } + + ret = address_space_map(as, index, nr_pages, io->pages, + &io->nr_new_pages, io->new_pages); + pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages); + if (ret <= 0) + goto out_new_pages; + + io->nr_pages = ret; + io->pages[io->nr_pages] = NULL; + io->new_pages[io->nr_new_pages] = NULL; + + if (io->rw == READ) + __generic_file_read_iodesc(io, 0); + else if (io->rw == WRITE) + __generic_file_write_iodesc(io); + + return 0; + +out_new_pages: + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); +out_pages: + if (io->pages != io->def_pages) + kfree(io->pages); +out_io: + if (io->rw == WRITE) + up(&inode->i_sem); + if (!ret) + io->cb.fn(io->cb.data, io->cb.vec, ret); + kfree(io); + return ret; +} diff -urNp x-ref/mm/memory.c x/mm/memory.c --- x-ref/mm/memory.c 2003-02-14 05:22:27.000000000 +0100 +++ x/mm/memory.c 2003-02-14 05:22:30.000000000 +0100 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -447,9 +448,11 @@ static struct page * follow_page(struct pte = *ptep; pte_kunmap(ptep); if (pte_present(pte)) { + struct page * page = pte_page(pte); + prefetch(page); if (!write || (pte_write(pte) && pte_dirty(pte))) - return pte_page(pte); + return page; } if (pte_none(pte)) none = 1; @@ -1599,3 +1602,233 @@ struct page * vmalloc_to_page(void * vma } return page; } + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + * FIXME: some architectures need to flush the cache based on user addresses + * here. Someone please provide a better macro than flush_cache_page. + */ + +#define dprintk(x...) 
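
rw_kvec_core() above sizes its page arrays from the byte range of the request: the span [pos, pos+size) touches page-cache pages index..eindex, starting `offset` bytes into the first page. A minimal sketch of that arithmetic with hypothetical values (4K pages assumed for the example):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* assume 4K pages for the example */
#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long pos = 5000;	/* hypothetical file position */
	unsigned long size = 10000;	/* hypothetical request size */

	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	unsigned long eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = eindex - index + 1;
	unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);

	/* bytes 5000..14999 cover pages 1..3, starting 904 bytes into page 1 */
	printf("pages %lu..%lu (%lu pages), first-page offset %lu\n",
	       index, eindex, nr_pages, offset);
	return 0;
}
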
+atomic_t user_pinned_pages = ATOMIC_INIT(0); + +struct kvec *map_user_kvec(int rw, unsigned long ptr, size_t len) +{ + return mm_map_user_kvec(current->mm, rw, ptr, len); +} + +struct kvec *mm_map_user_kvec(struct mm_struct *mm, int rw, unsigned long ptr, + size_t len) +{ + struct kvec *vec; + struct kveclet *veclet; + unsigned long end; + int err; + int i; + int datain = (rw == READ); + int nr_pages; + + end = ptr + len; + if (unlikely(end < ptr)) + return ERR_PTR(-EINVAL); + + nr_pages = (ptr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages -= ptr >> PAGE_SHIFT; + nr_pages ++; + + atomic_add(nr_pages, &user_pinned_pages); + err = -EAGAIN; + if (unlikely(atomic_read(&user_pinned_pages) >= aio_max_pinned)) + goto out_adjust; + + vec = kmalloc(sizeof(struct kvec) + nr_pages * sizeof(struct kveclet), + GFP_KERNEL); + err = -ENOMEM; + if (unlikely(!vec)) + goto out_adjust; + + vec->nr = 0; + vec->max_nr = nr_pages; + veclet = vec->veclet; + + /* Make sure the iobuf is not already mapped somewhere. */ + dprintk ("map_user_kiobuf: begin\n"); + + down_read(&mm->mmap_sem); + + err = -EFAULT; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + veclet->offset = ptr & ~PAGE_MASK; + veclet->length = PAGE_SIZE - veclet->offset; + if (len < veclet->length) + veclet->length = len; + ptr &= PAGE_MASK; + len -= veclet->length; + + err = get_user_pages(current, mm, ptr, 1, + datain, 0, &veclet->page, NULL); + if (unlikely(err < 0)) + goto out_unlock; + + veclet++; + ptr += PAGE_SIZE; + vec->nr = ++i; + } + + veclet->page = NULL; /* dummy for the prefetch in free_kvec */ + veclet->length = 0; /* bug checking ;-) */ + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return vec; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kvec(vec, 0); + kfree(vec); + dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw); + return ERR_PTR(err); + + out_adjust: + atomic_sub(nr_pages, &user_pinned_pages); + dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw); + return ERR_PTR(err); +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kvec (struct kvec *vec, int dirtied) +{ + struct kveclet *veclet = vec->veclet; + struct kveclet *end = vec->veclet + vec->nr; + struct page *map = veclet->page; + + prefetchw(map); + for (; vecletpage) { + prefetchw(veclet[1].page); + if (likely(map != NULL) && !PageReserved(map)) { + if (dirtied) { + SetPageDirty(map); + flush_dcache_page(map); /* FIXME */ + } + __free_page(map); + } + } + + atomic_sub(vec->max_nr, &user_pinned_pages); + vec->nr = 0; +} + +void free_kvec(struct kvec *vec) +{ + if (unlikely(vec->nr)) + BUG(); + kfree(vec); +} + +/* kvec memory copy helper: appends len bytes in from to dst. + */ +void memcpy_to_kvec_dst(struct kvec_dst *dst, const char *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (dst->space < cnt) + cnt = dst->space; + + memcpy(dst->dst, from, cnt); + from += cnt; + dst->space -= cnt; + dst->dst += cnt; + len -= cnt; + if (!dst->space && len) { + kvec_dst_unmap(dst); + dst->let++; + dst->offset = 0; + kvec_dst_map(dst); + if (unlikely(!dst->space)) + BUG(); + } + } while (len); +} + +/* kvec memory copy helper: copies and consumes len bytes in from to dst. 
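
memcpy_to_kvec_dst() above appends into the destination veclets one at a time, switching to the next veclet whenever the current one fills. A minimal userspace sketch of the same copy loop, with plain heap buffers standing in for the kmap'ed pages and illustrative names (the real code BUGs if it runs out of veclets; the sketch simply assumes enough space):

#include <stdio.h>
#include <string.h>

struct seglet {			/* stands in for struct kveclet */
	char *buf;
	unsigned offset;
	unsigned length;
};

static void copy_to_segments(struct seglet *let, const char *from, long len)
{
	char *dst = let->buf + let->offset;
	long space = let->length;

	while (len) {
		long cnt = len < space ? len : space;

		memcpy(dst, from, cnt);
		from += cnt;
		dst += cnt;
		space -= cnt;
		len -= cnt;
		if (!space && len) {		/* advance to the next seglet */
			let++;
			dst = let->buf + let->offset;
			space = let->length;
		}
	}
}

int main(void)
{
	char a[8] = "", b[8] = "";
	struct seglet vec[] = { { a, 2, 4 }, { b, 0, 8 } };

	copy_to_segments(vec, "0123456789", 10);
	printf("seg0: %.4s  seg1: %.6s\n", a + 2, b);	/* "0123"  "456789" */
	return 0;
}
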
+ */ +void memcpy_from_kvec_dst(char *to, struct kvec_dst *from, long len) +{ + if (unlikely(len < 0)) + BUG(); + do { + int cnt = len; + if (from->space < cnt) + cnt = from->space; + + memcpy(to, from->dst, cnt); + to += cnt; + from->space -= cnt; + from->dst += cnt; + len -= cnt; + if (unlikely(!from->space && len)) { + kvec_dst_unmap(from); + from->let++; + from->offset = 0; + kvec_dst_map(from); + if (unlikely(!from->space)) + BUG(); + } + } while (len); +} + +/* + */ +int copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len) +{ + struct kveclet *let = to->veclet; + int ret = 0; + + if ((ssize_t)len < 0) + BUG(); + + while (offset) { + if (offset < let->length) + break; + offset -= let->length; + let++; + + if ((let - to->veclet) > to->nr) + BUG(); + } + + /* FIXME: kmap deadlockage */ + while (len && !ret) { + char *dst = kmap(let->page); + size_t this; + + this = let->length - offset; + if (len < this) + this = len; + + offset += let->offset; + if (copy_from_user(dst+offset, from, this)) + ret = -EFAULT; + + from += this; + len -= this; + kunmap(let->page); + offset = 0; + let ++; + } + + return ret; +} diff -urNp x-ref/mm/wtd.c x/mm/wtd.c --- x-ref/mm/wtd.c 1970-01-01 01:00:00.000000000 +0100 +++ x/mm/wtd.c 2003-02-14 05:22:30.000000000 +0100 @@ -0,0 +1,77 @@ +#include +#include +#include +#include + +static void __wtd_lock_page_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct page *page = (struct page *)wtd->data; + + if (!TryLockPage(page)) { + __remove_wait_queue(page_waitqueue(page), &wtd->wait); + wtd_queue(wtd); + } else + async_run_tq_disk(); +} + +int wtd_lock_page(struct worktodo *wtd, struct page *page) +{ + if (TryLockPage(page)) { + wtd->data = page; + init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter); + + /* Wakeups may race with TryLockPage, so try again within the wait + * queue spinlock. + */ + if (!add_wait_queue_cond(page_waitqueue(page), &wtd->wait, + TryLockPage(page))) { + /* Page is still locked. Kick the disk queue... */ + run_task_queue(&tq_disk); + return 0; + } + } + + return 1; +} + +static void __wtd_bh_waiter(wait_queue_t *wait) +{ + struct worktodo *wtd = (struct worktodo *)wait; + struct buffer_head *bh = (struct buffer_head *)wtd->data; + + if (!buffer_locked(bh)) { + __remove_wait_queue(&bh->b_wait, &wtd->wait); + wtd_queue(wtd); + } else + async_run_tq_disk(); +} + +int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh) +{ + if (!buffer_locked(bh)) { + return 1; + } + wtd->data = bh; + init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter); + if (add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh))) + return 1; + run_task_queue(&tq_disk); + return 0; +} + +static void do_run_tq_disk(void *data) +{ + run_task_queue(&tq_disk); +} + +static struct tq_struct run_disk_tq = { + .routine = do_run_tq_disk, +}; + +void async_run_tq_disk(void) +{ + mb(); /* going to read tq_disk locklessy */ + if (TQ_ACTIVE(tq_disk)) + schedule_task(&run_disk_tq); +}
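
With the syscalls wired up, the ABI in aio_abi.h is usable directly from userspace. Below is a minimal sketch of the calling convention: set up a context, submit one IOCB_CMD_PREAD, reap the completion event, and tear the context down. The __NR_io_* numbers are deliberately not hard-coded here; they must come from the patched kernel's headers for the target architecture, so treat this as an illustration of the structures and argument order rather than a drop-in test program.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>	/* aio_context_t, struct iocb, struct io_event */

static long io_setup(unsigned nr, aio_context_t *ctx)
{ return syscall(__NR_io_setup, nr, ctx); }
static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbs)
{ return syscall(__NR_io_submit, ctx, nr, iocbs); }
static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{ return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout); }
static long io_destroy(aio_context_t ctx)
{ return syscall(__NR_io_destroy, ctx); }

int main(int argc, char *argv[])
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	static char buf[4096];
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (io_setup(32, &ctx) < 0)		/* room for 32 in-flight iocbs */
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes = fd;
	cb.aio_buf = (unsigned long)buf;	/* __u64 in the ABI */
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;
	cb.aio_data = 0x1234;			/* cookie, returned in ev.data */

	if (io_submit(ctx, 1, cbs) == 1 &&
	    io_getevents(ctx, 1, 1, &ev, NULL) == 1)
		printf("res=%lld cookie=%llx\n",
		       (long long)ev.res, (unsigned long long)ev.data);

	io_destroy(ctx);
	close(fd);
	return 0;
}
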