diff -urNp x-ref/arch/i386/config.in x/arch/i386/config.in
--- x-ref/arch/i386/config.in	2003-01-14 02:37:48.000000000 +0100
+++ x/arch/i386/config.in	2003-01-14 02:37:51.000000000 +0100
@@ -202,11 +202,13 @@ else
 fi
 if [ "$CONFIG_HIGHMEM64G" = "y" ]; then
 	define_bool CONFIG_X86_PAE y
+	define_int CONFIG_FORCE_MAX_ZONEORDER 10
 	choice 'User address space size' \
 		"3GB	CONFIG_1GB \
 		 2GB	CONFIG_2GB \
 		 1GB	CONFIG_3GB" 3GB
 else
+	define_int CONFIG_FORCE_MAX_ZONEORDER 11
 	choice 'User address space size' \
 		"3GB	CONFIG_1GB \
 		 2GB	CONFIG_2GB \
diff -urNp x-ref/fs/proc/array.c x/fs/proc/array.c
--- x-ref/fs/proc/array.c	2003-01-14 02:37:34.000000000 +0100
+++ x/fs/proc/array.c	2003-01-14 02:37:51.000000000 +0100
@@ -403,6 +403,13 @@ static inline void statm_pte_range(pmd_t
 
 	if (pmd_none(*pmd))
 		return;
+	if (pmd_bigpage(*pmd)) {
+		*total += BIGPAGE_PAGES;
+		*pages += BIGPAGE_PAGES;
+		*shared += BIGPAGE_PAGES;
+		*dirty += BIGPAGE_PAGES;
+		return;
+	}
 	if (pmd_bad(*pmd)) {
 		pmd_ERROR(*pmd);
 		pmd_clear(pmd);
diff -urNp x-ref/fs/proc/proc_misc.c x/fs/proc/proc_misc.c
--- x-ref/fs/proc/proc_misc.c	2003-01-14 02:37:48.000000000 +0100
+++ x/fs/proc/proc_misc.c	2003-01-14 02:37:51.000000000 +0100
@@ -191,7 +191,8 @@ static int meminfo_read_proc(char *page,
 		"LowTotal:     %8lu kB\n"
 		"LowFree:      %8lu kB\n"
 		"SwapTotal:    %8lu kB\n"
-		"SwapFree:     %8lu kB\n",
+		"SwapFree:     %8lu kB\n"
+		"BigFree:      %8lu kB\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.sharedram),
@@ -205,7 +206,8 @@ static int meminfo_read_proc(char *page,
 		K(i.totalram-i.totalhigh),
 		K(i.freeram-i.freehigh),
 		K(i.totalswap),
-		K(i.freeswap));
+		K(i.freeswap),
+		nr_bigpages << (PMD_SHIFT-10));
 
 #ifdef CONFIG_DISCONTIGMEM
 	for (nid = 0; nid < numnodes; ++nid) {
diff -urNp x-ref/include/asm-i386/pgalloc.h x/include/asm-i386/pgalloc.h
--- x-ref/include/asm-i386/pgalloc.h	2003-01-14 02:37:34.000000000 +0100
+++ x/include/asm-i386/pgalloc.h	2003-01-14 02:37:51.000000000 +0100
@@ -12,6 +12,7 @@
 #define pgtable_cache_size	(current_cpu_data.pgtable_cache_sz)
 
 #define pmd_populate(mm, pmd, page)	set_pmd(pmd, mk_pmd(page, __pgprot(_PAGE_TABLE)))
+#define pmd_populate_bigpage(mm, pmd, page)	set_pmd(pmd, mk_pmd(page, __pgprot(_PAGE_TABLE + _PAGE_PSE)))
 
 /*
  * Allocate and free page tables.
diff -urNp x-ref/include/asm-i386/pgtable.h x/include/asm-i386/pgtable.h
--- x-ref/include/asm-i386/pgtable.h	2003-01-14 02:37:48.000000000 +0100
+++ x/include/asm-i386/pgtable.h	2003-01-14 02:37:51.000000000 +0100
@@ -272,7 +272,14 @@ extern unsigned long pg0[1024];
 #define pmd_present(x)	((x).pmd_low & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 #define pmd_bad(x)	(((x).pmd_low & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#define pmd_bigpage(x)	(pmd_val(x) & _PAGE_PSE)
+#define BIGPAGE_SHIFT	(PMD_SHIFT)
+#define BIGPAGE_SIZE	(1UL << BIGPAGE_SHIFT)
+#define BIGPAGE_MASK	(BIGPAGE_SIZE - 1)
+#define BIGPAGE_PAGES	(BIGPAGE_SIZE / PAGE_SIZE)
+
+#define HAVE_ARCH_BIGPAGES	cpu_has_pse
 
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
diff -urNp x-ref/include/linux/fs.h x/include/linux/fs.h
--- x-ref/include/linux/fs.h	2003-01-14 02:37:49.000000000 +0100
+++ x/include/linux/fs.h	2003-01-14 02:37:51.000000000 +0100
@@ -961,6 +961,8 @@ struct file_operations {
 	ssize_t (*aio_read)(struct file *, struct kiocb *, struct iocb *);
 	ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb *);
 	ssize_t (*aio_fsync)(struct file *, struct kiocb *, struct iocb *);
+
+	int (*munmap) (struct vm_area_struct *, unsigned long, size_t);
 };
 
 struct inode_operations {
diff -urNp x-ref/include/linux/mm.h x/include/linux/mm.h
--- x-ref/include/linux/mm.h	2003-01-14 02:37:49.000000000 +0100
+++ x/include/linux/mm.h	2003-01-14 02:37:51.000000000 +0100
@@ -103,6 +103,7 @@ struct vm_area_struct {
 #define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
 #define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
+#define VM_BIGPAGE	0x00100000	/* bigpage mappings, no pte's */
 
 #define VM_STACK_FLAGS	0x00000177
 
@@ -299,6 +300,7 @@ typedef struct page {
 #define PG_reserved		14
 #define PG_launder		15	/* written out by VM pressure.. */
 #define PG_fs_1			16	/* Filesystem specific */
+#define PG_bigpage		17
 
 /* Make it prettier to test the above... */
 #define UnlockPage(page)	unlock_page(page)
@@ -313,6 +315,7 @@ typedef struct page {
 #define TryLockPage(page)	test_and_set_bit(PG_locked, &(page)->flags)
 #define PageChecked(page)	test_bit(PG_checked, &(page)->flags)
 #define SetPageChecked(page)	set_bit(PG_checked, &(page)->flags)
+#define BigPage(page)		test_bit(PG_bigpage, &(page)->flags)
 #define PageLaunder(page)	test_bit(PG_launder, &(page)->flags)
 #define SetPageLaunder(page)	set_bit(PG_launder, &(page)->flags)
 
@@ -458,6 +461,10 @@ extern unsigned long FASTCALL(get_zeroed
  */
 #define get_free_page get_zeroed_page
 
+extern unsigned long nr_bigpages;
+extern struct page * FASTCALL(alloc_bigpage(void));
+extern void FASTCALL(free_bigpage(struct page *page));
+
 /*
  * There is only one 'core' page-freeing function.
  */
 
@@ -715,6 +722,15 @@ extern struct vm_area_struct *find_exten
 
 extern struct page * vmalloc_to_page(void *addr);
 
+#ifndef HAVE_ARCH_BIGPAGES
+#define pmd_bigpage(x)		(0)
+#define BIGPAGE_MASK		(0)
+#define BIGPAGE_SIZE		(0)
+#define BIGPAGE_SHIFT		(0)
+#define BIGPAGE_PAGES		(1)
+#define pmd_populate_bigpage(x,y,z)	out_of_line_bug()
+#endif /* HAVE_ARCH_BIGPAGES */
+
 #endif /* __KERNEL__ */
 
 #endif
diff -urNp x-ref/include/linux/shmem_fs.h x/include/linux/shmem_fs.h
--- x-ref/include/linux/shmem_fs.h	2002-01-22 18:56:29.000000000 +0100
+++ x/include/linux/shmem_fs.h	2003-01-14 02:37:51.000000000 +0100
@@ -29,6 +29,8 @@ struct shmem_inode_info {
 	int			locked;     /* into memory */
 	struct list_head	list;
 	struct inode	       *inode;
+	unsigned long		max_bigpages;
+	struct page	      **bigpages;
 };
 
 struct shmem_sb_info {
@@ -40,5 +42,16 @@ struct shmem_sb_info {
 };
 
 #define SHMEM_I(inode)  (&inode->u.shmem_i)
+#define I_BIGPAGE(inode) (SHMEM_I(inode)->bigpages)
+
+/*
+ * Limit kmalloc() size.
+ */
+#define MAX_BIGPAGES (32768/sizeof(struct page *))
+
+extern int shm_use_bigpages;
+extern void shm_enable_bigpages(struct inode *inode);
+extern int shmem_make_bigpage_mmap(struct file * file, struct vm_area_struct * vma);
+extern int shmem_munmap(struct vm_area_struct * vma, unsigned long addr, size_t size);
 
 #endif
diff -urNp x-ref/include/linux/sysctl.h x/include/linux/sysctl.h
--- x-ref/include/linux/sysctl.h	2003-01-14 02:37:49.000000000 +0100
+++ x/include/linux/sysctl.h	2003-01-14 02:37:51.000000000 +0100
@@ -124,6 +124,7 @@ enum
 	KERN_CORE_USES_PID=52,	/* int: use core or core.%pid */
 	KERN_TAINTED=53,	/* int: various kernel tainted flags */
 	KERN_CADPID=54,		/* int: PID of the process to notify on CAD */
+	KERN_SHMUSEBIGPAGES=55,	/* int: use bigpages wherever possible */
 	KERN_CORE_PATTERN=56,	/* string: pattern for core-files */
 };
diff -urNp x-ref/ipc/shm.c x/ipc/shm.c
--- x-ref/ipc/shm.c	2003-01-14 02:37:48.000000000 +0100
+++ x/ipc/shm.c	2003-01-14 02:37:51.000000000 +0100
@@ -159,15 +159,24 @@ static void shm_close (struct vm_area_st
 
 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 {
+	int error = 0;
+
 	UPDATE_ATIME(file->f_dentry->d_inode);
+	if (SHMEM_I(file->f_dentry->d_inode)->bigpages) {
+		error = shmem_make_bigpage_mmap(file, vma);
+		if (error)
+			goto out;
+	}
 	vma->vm_ops = &shm_vm_ops;
 	vma->vm_flags &= ~VM_IO;
 	shm_inc(file->f_dentry->d_inode->i_ino);
-	return 0;
+ out:
+	return error;
 }
 
 static struct file_operations shm_file_operations = {
-	mmap:	shm_mmap
+	mmap:	shm_mmap,
+	munmap:	shmem_munmap,
 };
 
 static struct vm_operations_struct shm_vm_ops = {
@@ -218,6 +227,12 @@ static int newseg (key_t key, int shmflg
 	file->f_op = &shm_file_operations;
 	shm_tot += numpages;
 	shm_unlock (id);
+
+	if (shm_use_bigpages &&
+			!(size & BIGPAGE_MASK) &&
+			((size >> BIGPAGE_SHIFT) < MAX_BIGPAGES))
+		shm_enable_bigpages(file->f_dentry->d_inode);
+
 	return shp->id;
 
 no_id:
diff -urNp x-ref/kernel/fork.c x/kernel/fork.c
--- x-ref/kernel/fork.c	2003-01-14 02:37:49.000000000 +0100
+++ x/kernel/fork.c	2003-01-14 02:37:51.000000000 +0100
@@ -235,7 +235,8 @@ static inline int dup_mmap(struct mm_str
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
-		tmp->vm_flags &= ~VM_LOCKED;
+		if (!(tmp->vm_flags & VM_BIGPAGE))
+			tmp->vm_flags &= ~VM_LOCKED;
 		tmp->vm_mm = mm;
 		tmp->vm_next = NULL;
 		file = tmp->vm_file;
diff -urNp x-ref/kernel/sysctl.c x/kernel/sysctl.c
--- x-ref/kernel/sysctl.c	2003-01-14 02:37:49.000000000 +0100
+++ x/kernel/sysctl.c	2003-01-14 02:37:51.000000000 +0100
@@ -223,6 +223,8 @@ static ctl_table kern_table[] = {
 	 0444, NULL, &proc_dointvec},
 	{KERN_RTSIGMAX, "rtsig-max", &max_queued_signals, sizeof(int),
 	 0644, NULL, &proc_dointvec},
+	{KERN_SHMUSEBIGPAGES, "shm-use-bigpages", &shm_use_bigpages, sizeof(int),
+	 0644, NULL, &proc_dointvec},
 #ifdef CONFIG_SYSVIPC
 	{KERN_SHMMAX, "shmmax", &shm_ctlmax, sizeof (size_t),
 	 0644, NULL, &proc_doulongvec_minmax},
diff -urNp x-ref/mm/Makefile x/mm/Makefile
--- x-ref/mm/Makefile	2003-01-14 02:37:49.000000000 +0100
+++ x/mm/Makefile	2003-01-14 02:37:51.000000000 +0100
@@ -14,7 +14,7 @@ export-objs := shmem.o filemap.o memory.
 obj-y	 := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
-	    shmem.o
+	    shmem.o bigpages.o
 
 obj-$(CONFIG_HIGHMEM) += highmem.o
 obj-y += wtd.o
diff -urNp x-ref/mm/bigpages.c x/mm/bigpages.c
--- x-ref/mm/bigpages.c	1970-01-01 01:00:00.000000000 +0100
+++ x/mm/bigpages.c	2003-01-14 02:37:51.000000000 +0100
@@ -0,0 +1,97 @@
+/*
+ * linux/mm/bigpages.c
+ *
+ * Copyright (C) 2002 Ingo Molnar
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+
+static spinlock_t bigpages_lock = SPIN_LOCK_UNLOCKED;
+unsigned long nr_bigpages;
+static LIST_HEAD(bigpages_list);
+
+#define ORDER_BIGPAGE	(PMD_SHIFT - PAGE_SHIFT)
+
+struct page *alloc_bigpage(void)
+{
+	list_t *head = &bigpages_list;
+	struct page *page = NULL;
+
+	spin_lock(&bigpages_lock);
+	if (nr_bigpages) {
+		page = list_entry(head->next, struct page, list);
+		list_del_init(head->next);
+		nr_bigpages--;
+	}
+	spin_unlock(&bigpages_lock);
+
+	return page;
+}
+
+void free_bigpage(struct page *page)
+{
+	struct page *p;
+	int i;
+
+#ifndef CONFIG_DISCONTIGMEM
+	BUG_ON((page - mem_map) % BIGPAGE_PAGES);
+#endif
+	for (i = 0; i < (1 << ORDER_BIGPAGE); i++) {
+		p = page + i;
+		set_page_count(p, 2);
+		set_bit(PG_bigpage, &p->flags);
+		clear_highpage(p);
+	}
+	spin_lock(&bigpages_lock);
+	nr_bigpages++;
+	list_add(&page->list, &bigpages_list);
+	spin_unlock(&bigpages_lock);
+}
+
+static int grow_bigpages_pool(int pages)
+{
+	struct page *page;
+	int allocated = 0;
+
+	while (pages) {
+		page = alloc_pages(__GFP_HIGHMEM, ORDER_BIGPAGE);
+		if (!page)
+			break;
+		free_bigpage(page);
+		pages--;
+		allocated++;
+	}
+	printk("bigpage subsystem: allocated %ld bigpages (=%ldMB).\n",
+		nr_bigpages, nr_bigpages << (BIGPAGE_SHIFT - 20));
+	return allocated;
+}
+
+static __initdata int boot_bigpages;
+
+static __init int reserve_bigpages(char *str)
+{
+	unsigned long pages = memparse(str, &str) >> PAGE_SHIFT;
+
+	pages >>= ORDER_BIGPAGE;
+	boot_bigpages = pages;
+
+	return 0;
+}
+
+static __init int init_bigpage_pool(void)
+{
+	grow_bigpages_pool(boot_bigpages);
+	return 0;
+}
+
+__setup("bigpages=", reserve_bigpages);
+__initcall(init_bigpage_pool);
diff -urNp x-ref/mm/memory.c x/mm/memory.c
--- x-ref/mm/memory.c	2003-01-14 02:37:49.000000000 +0100
+++ x/mm/memory.c	2003-01-14 02:37:51.000000000 +0100
@@ -97,6 +97,8 @@ static inline void free_one_pmd(pmd_t *
 
 	if (pmd_none(*dir))
 		return;
+	if (pmd_bigpage(*dir))
+		return;
 	if (pmd_bad(*dir)) {
 		pmd_ERROR(*dir);
 		pmd_clear(dir);
@@ -194,6 +196,7 @@ int copy_page_range(struct mm_struct *ds
 	unsigned long address = vma->vm_start;
 	unsigned long end = vma->vm_end;
 	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+	int bigpage = vma->vm_flags & VM_BIGPAGE;
 
 	src_pgd = pgd_offset(src, address)-1;
 	dst_pgd = pgd_offset(dst, address)-1;
@@ -228,6 +231,17 @@ skip_copy_pmd_range:	address = (address
 		if (pmd_none(*src_pmd))
 			goto skip_copy_pte_range;
+		if (bigpage) {
+			if (!pmd_bigpage(*src_pmd))
+				pmd_clear(dst_pmd);
+			else
+				*dst_pmd = *src_pmd;
+			address += PMD_SIZE;
+			if (address >= end)
+				goto out;
+			goto cont_copy_pmd_range;
+		}
+		BUG_ON(pmd_bigpage(*src_pmd));
 		if (pmd_bad(*src_pmd)) {
 			pmd_ERROR(*src_pmd);
 			pmd_clear(src_pmd);
@@ -319,6 +333,7 @@ static inline int zap_pte_range(mmu_gath
 
 	if (pmd_none(*pmd))
 		return 0;
+	BUG_ON(pmd_bigpage(*pmd));
 	if (pmd_bad(*pmd)) {
 		pmd_ERROR(*pmd);
 		pmd_clear(pmd);
@@ -368,7 +383,10 @@ static inline int zap_pmd_range(mmu_gath
 		end = ((address + PGDIR_SIZE) & PGDIR_MASK);
 	freed = 0;
 	do {
-		freed += zap_pte_range(tlb, pmd, address, end - address);
+		if (pmd_bigpage(*pmd))
+			pmd_clear(pmd);
+		else
+			freed += zap_pte_range(tlb, pmd, address, end - address);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
@@ -439,6 +457,8 @@ static struct page * follow_page(struct
 	pmd = pmd_offset(pgd, address);
 	if (pmd_none(*pmd))
 		goto out;
+	if (pmd_bigpage(*pmd))
+		return __pmd_page(*pmd) + (address & BIGPAGE_MASK) / PAGE_SIZE;
 	if (pmd_bad(*pmd))
 		BUG();
 
@@ -1429,6 +1449,40 @@ static inline int handle_pte_fault(struc
 	return 1;
 }
 
+static int handle_bigpage_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access, pmd_t *pmd)
+{
+	struct page *new_page;
+	pte_t dummy;
+
+	spin_unlock(&mm->page_table_lock);
+	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
+
+	if (!new_page)
+		return 0;
+	if (new_page == NOPAGE_OOM)
+		return -1;
+	BUG_ON(!BigPage(new_page));
+	spin_lock(&mm->page_table_lock);
+	/*
+	 * Another context was faster.
+	 */
+	if (pmd_present(*pmd)) {
+		if (pmd_bigpage(*pmd)) {
+			spin_unlock(&mm->page_table_lock);
+			return 1;
+		}
+		free_one_pmd(pmd);
+	}
+	/*
+	 * Major fault.
+	 */
+	pmd_populate_bigpage(mm, pmd, new_page);
+	spin_unlock(&mm->page_table_lock);
+	flush_tlb_page(vma, address & PMD_MASK);
+	update_mmu_cache(vma, address, dummy);
+	return 2;
+}
+
 /*
  * By the time we get here, we already hold the mm semaphore
  */
@@ -1437,6 +1491,7 @@ int handle_mm_fault(struct mm_struct *mm
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
+	int bigpage = vma->vm_flags & VM_BIGPAGE;
 
 	current->state = TASK_RUNNING;
 	pgd = pgd_offset(mm, address);
@@ -1448,11 +1503,13 @@ int handle_mm_fault(struct mm_struct *mm
 
 	spin_lock(&mm->page_table_lock);
 	pmd = pmd_alloc(mm, pgd, address);
 
-	if (pmd) {
+	if (pmd && !bigpage) {
 		pte_t * pte = pte_alloc_atomic(mm, pmd, address);
 		if (pte)
 			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
-	}
+	} else
+		if (pmd)
+			return handle_bigpage_fault(mm, vma, address, write_access, pmd);
 	spin_unlock(&mm->page_table_lock);
 	return -1;
 }
@@ -1589,6 +1646,7 @@ struct page * vmalloc_to_page(void * vma
 	if (!pgd_none(*pgd)) {
 		pmd = pmd_offset(pgd, addr);
 		if (!pmd_none(*pmd)) {
+			BUG_ON(pmd_bigpage(*pmd));
 			ptep = pte_offset_atomic(pmd, addr);
 			pte = *ptep;
 			pte_kunmap(ptep);
diff -urNp x-ref/mm/mlock.c x/mm/mlock.c
--- x-ref/mm/mlock.c	2002-01-22 18:53:56.000000000 +0100
+++ x/mm/mlock.c	2003-01-14 02:37:51.000000000 +0100
@@ -162,6 +162,8 @@ static int do_mlock(unsigned long start,
 	vma = find_vma(current->mm, start);
 	if (!vma || vma->vm_start > start)
 		return -ENOMEM;
+	if (vma->vm_flags & VM_BIGPAGE)
+		return -EINVAL;
 
 	for (nstart = start ; ; ) {
 		unsigned int newflags;
diff -urNp x-ref/mm/mmap.c x/mm/mmap.c
--- x-ref/mm/mmap.c	2003-01-14 02:37:32.000000000 +0100
+++ x/mm/mmap.c	2003-01-14 02:37:51.000000000 +0100
@@ -988,6 +988,11 @@ int do_munmap(struct mm_struct *mm, unsi
 	    && mm->map_count >= max_map_count)
 		return -ENOMEM;
 
+	if (mpnt->vm_file && mpnt->vm_file->f_op &&
+			mpnt->vm_file->f_op->munmap &&
+			mpnt->vm_file->f_op->munmap(mpnt, addr, len))
+		return -EINVAL;
+
 	/*
 	 * We may need one additional vma to fix up the mappings ...
 	 * and this is the last chance for an easy error exit.
diff -urNp x-ref/mm/mprotect.c x/mm/mprotect.c
--- x-ref/mm/mprotect.c	2003-01-14 02:37:42.000000000 +0100
+++ x/mm/mprotect.c	2003-01-14 02:37:51.000000000 +0100
@@ -290,6 +290,10 @@ asmlinkage long sys_mprotect(unsigned lo
 	if (!vma || vma->vm_start > start)
 		goto out;
 
+	error = 0;
+	if (vma->vm_flags & VM_BIGPAGE)
+		goto out;
+
 	for (nstart = start ; ; ) {
 		unsigned int newflags;
 		int last = 0;
diff -urNp x-ref/mm/mremap.c x/mm/mremap.c
--- x-ref/mm/mremap.c	2003-01-14 02:37:34.000000000 +0100
+++ x/mm/mremap.c	2003-01-14 02:37:51.000000000 +0100
@@ -270,6 +270,11 @@ unsigned long do_mremap(unsigned long ad
 	vma = find_vma(current->mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
+	/*
+	 * Do not remap bigpages, yet.
+	 */
+	if (vma->vm_flags & VM_BIGPAGE)
+		goto out;
 
 	/* We can't remap across vm area boundaries */
 	if (old_len > vma->vm_end - addr)
 		goto out;
diff -urNp x-ref/mm/page_alloc.c x/mm/page_alloc.c
--- x-ref/mm/page_alloc.c	2003-01-14 02:37:48.000000000 +0100
+++ x/mm/page_alloc.c	2003-01-14 02:37:51.000000000 +0100
@@ -144,6 +144,7 @@ static void __free_pages_ok (struct page
 		BUG();
 	if (!VALID_PAGE(page))
 		BUG();
+	BUG_ON(BigPage(page));
 	if (PageLocked(page))
 		BUG();
 	if (PageActive(page))
@@ -267,6 +268,7 @@ static struct page * rmqueue(zone_t *zon
 			set_page_count(page, 1);
 			if (BAD_RANGE(zone,page))
 				BUG();
+			BUG_ON(BigPage(page));
 			if (PageLRU(page))
 				BUG();
 			if (PageActive(page))
@@ -314,6 +316,7 @@ static struct page * balance_classzone(z
 				BUG();
 			if (!VALID_PAGE(page))
 				BUG();
+			BUG_ON(BigPage(page));
 			if (PageLocked(page))
 				BUG();
 			if (PageLRU(page))
diff -urNp x-ref/mm/shmem.c x/mm/shmem.c
--- x-ref/mm/shmem.c	2003-01-14 02:37:33.000000000 +0100
+++ x/mm/shmem.c	2003-01-14 02:40:14.000000000 +0100
@@ -5,6 +5,7 @@
  *		 2000 Transmeta Corp.
  *		 2000-2001 Christoph Rohland
  *		 2000-2001 SAP AG
+ *		 2002 Ingo Molnar, Red Hat Inc.
  *
  * This file is released under the GPL.
  */
@@ -27,12 +28,15 @@
 #include
 #include
 #include
+#include
 #include
 
 /* This magic number is used in glibc for posix shared memory */
 #define TMPFS_MAGIC	0x01021994
 
+int shm_use_bigpages;
+
 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
 
@@ -58,6 +62,32 @@ atomic_t shmem_nrpages = ATOMIC_INIT(0);
 static struct page *shmem_getpage_locked(struct shmem_inode_info *, struct inode *, unsigned long);
 
+static int shm_alloc_space(struct inode * inode, unsigned long space)
+{
+	struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb);
+
+	spin_lock(&sbinfo->stat_lock);
+	if (sbinfo->free_blocks < space) {
+		spin_unlock(&sbinfo->stat_lock);
+		return -ENOSPC;
+	}
+	sbinfo->free_blocks -= space;
+	inode->i_blocks += space*BLOCKS_PER_PAGE;
+	spin_unlock(&sbinfo->stat_lock);
+
+	return 0;
+}
+
+static void shm_free_space(struct inode * inode, unsigned long freed)
+{
+	struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb);
+
+	spin_lock(&sbinfo->stat_lock);
+	sbinfo->free_blocks += freed;
+	inode->i_blocks -= freed*BLOCKS_PER_PAGE;
+	spin_unlock(&sbinfo->stat_lock);
+}
+
 /*
  * shmem_recalc_inode - recalculate the size of an inode
 
@@ -84,13 +114,8 @@ static void shmem_recalc_inode(struct in
 
 	freed = (inode->i_blocks/BLOCKS_PER_PAGE) -
 		(inode->i_mapping->nrpages + SHMEM_I(inode)->swapped);
-	if (freed){
-		struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb);
-		inode->i_blocks -= freed*BLOCKS_PER_PAGE;
-		spin_lock (&sbinfo->stat_lock);
-		sbinfo->free_blocks += freed;
-		spin_unlock (&sbinfo->stat_lock);
-	}
+	if (freed)
+		shm_free_space(inode, freed);
 }
 
@@ -316,43 +341,57 @@ shmem_truncate_indirect(struct shmem_ino
 
 static void shmem_truncate (struct inode * inode)
 {
-	unsigned long index;
+	unsigned long index, i;
 	unsigned long partial;
 	unsigned long freed = 0;
 	struct shmem_inode_info * info = SHMEM_I(inode);
+	struct page *page;
 
 	down(&info->sem);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	spin_lock (&info->lock);
-	index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	partial = inode->i_size & ~PAGE_CACHE_MASK;
-
-	if (partial) {
-		swp_entry_t *entry = shmem_swp_entry(info, index-1, 0);
-		struct page *page;
-		/*
-		 * This check is racy: it's faintly possible that page
-		 * was assigned to swap during truncate_inode_pages,
-		 * and now assigned to file; but better than nothing.
-		 */
-		if (!IS_ERR(entry) && entry->val) {
-			spin_unlock(&info->lock);
-			page = shmem_getpage_locked(info, inode, index-1);
-			if (!IS_ERR(page)) {
-				memclear_highpage_flush(page, partial,
-					PAGE_CACHE_SIZE - partial);
-				UnlockPage(page);
-				page_cache_release(page);
+	if (!info->bigpages) {
+		index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		partial = inode->i_size & ~PAGE_CACHE_MASK;
+
+		if (partial) {
+			swp_entry_t *entry = shmem_swp_entry(info, index-1, 0);
+			struct page *page;
+			/*
+			 * This check is racy: it's faintly possible that page
+			 * was assigned to swap during truncate_inode_pages,
+			 * and now assigned to file; but better than nothing.
+			 */
+			if (!IS_ERR(entry) && entry->val) {
+				spin_unlock(&info->lock);
+				page = shmem_getpage_locked(info, inode, index-1);
+				if (!IS_ERR(page)) {
+					memclear_highpage_flush(page, partial,
+						PAGE_CACHE_SIZE - partial);
+					UnlockPage(page);
+					page_cache_release(page);
+				}
+				spin_lock(&info->lock);
 			}
-			spin_lock(&info->lock);
 		}
-	}
+		while (index < info->next_index)
+			freed += shmem_truncate_indirect(info, index);
 
-	while (index < info->next_index)
-		freed += shmem_truncate_indirect(info, index);
+		info->swapped -= freed;
+		shmem_recalc_inode(inode);
+	} else {
+		index = (inode->i_size + BIGPAGE_SIZE - 1) >> BIGPAGE_SHIFT;
+
+		for (i = index; i < info->max_bigpages; i++) {
+			page = info->bigpages[i];
+			if (page) {
+				info->bigpages[i] = NULL;
+				free_bigpage(page);
+				shm_free_space(inode, BIGPAGE_PAGES);
+			}
+		}
+	}
 
-	info->swapped -= freed;
-	shmem_recalc_inode(inode);
 	spin_unlock (&info->lock);
 	up(&info->sem);
 }
@@ -360,6 +399,7 @@ static void shmem_truncate (struct inode
 static void shmem_delete_inode(struct inode * inode)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_op->truncate == shmem_truncate) {
 		spin_lock (&shmem_ilock);
@@ -367,7 +407,13 @@ static void shmem_delete_inode(struct in
 		spin_unlock (&shmem_ilock);
 		inode->i_size = 0;
 		shmem_truncate (inode);
+		if (info->bigpages) {
+			kfree(info->bigpages);
+			info->bigpages = NULL;
+			info->max_bigpages = 0;
+		}
 	}
+	BUG_ON(info->bigpages);
 	spin_lock (&sbinfo->stat_lock);
 	sbinfo->free_inodes++;
 	spin_unlock (&sbinfo->stat_lock);
@@ -435,6 +481,7 @@ void shmem_unuse(swp_entry_t entry, stru
 	struct list_head *p;
 	struct shmem_inode_info * info;
 
+	BUG_ON(BigPage(page));
 	spin_lock (&shmem_ilock);
 	list_for_each(p, &shmem_inodes) {
 		info = list_entry(p, struct shmem_inode_info, list);
@@ -466,6 +513,7 @@ static int shmem_writepage(struct page *
 
 	if (!PageLocked(page))
 		BUG();
+	BUG_ON(BigPage(page));
 	if (!PageLaunder(page))
 		return fail_writepage(page);
 
@@ -527,10 +575,10 @@ getswap:
 static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode * inode, unsigned long idx)
 {
 	struct address_space * mapping = inode->i_mapping;
-	struct shmem_sb_info *sbinfo;
 	struct page * page;
 	swp_entry_t *entry;
 
+	BUG_ON(info->bigpages);
 repeat:
 	page = find_lock_page(mapping, idx);
 	if (page)
@@ -594,13 +642,10 @@ repeat:
 		info->swapped--;
 		spin_unlock (&info->lock);
 	} else {
-		sbinfo = SHMEM_SB(inode->i_sb);
-		spin_unlock (&info->lock);
-		spin_lock (&sbinfo->stat_lock);
-		if (sbinfo->free_blocks == 0)
-			goto no_space;
-		sbinfo->free_blocks--;
-		spin_unlock (&sbinfo->stat_lock);
+		spin_unlock(&info->lock);
+
+		if (shm_alloc_space(inode, 1))
+			return ERR_PTR(-ENOSPC);
 
 		/* Ok, get a new page. We don't have to worry about the
 		 * info->lock spinlock here: we cannot race against
 		 * new shm entry. The inode semaphore we already hold
 		 * is enough to make this atomic. */
 		page = page_cache_alloc(mapping);
-		if (!page)
+		if (!page) {
+			shm_free_space(inode, 1);
 			return ERR_PTR(-ENOMEM);
+		}
 		clear_highpage(page);
 		flush_dcache_page(page);
-		inode->i_blocks += BLOCKS_PER_PAGE;
 		add_to_page_cache (page, mapping, idx);
 	}
 	/* We have the page */
 	SetPageUptodate(page);
 	return page;
-no_space:
-	spin_unlock (&sbinfo->stat_lock);
-	return ERR_PTR(-ENOSPC);
 
 wait_retry:
 	spin_unlock (&info->lock);
@@ -632,6 +675,48 @@ wait_retry:
 	spin_unlock (&info->lock);
 	wait_on_page(page);
 	page_cache_release(page);
 	goto repeat;
 }
 
+static struct page * shmem_getbigpage_locked(struct shmem_inode_info *info, struct inode * inode, unsigned long idx)
+{
+	unsigned long bigidx, offset;
+	struct page *page;
+
+	bigidx = idx / BIGPAGE_PAGES;
+	offset = idx % BIGPAGE_PAGES;
+
+	if (bigidx >= info->max_bigpages)
+		return ERR_PTR(-ENOSPC);
+got_bigpage:
+	page = info->bigpages[bigidx];
+	if (page) {
+		page += offset;
+		get_page(page);
+		BUG_ON(!BigPage(page));
+		lock_page(page);
+		return page;
+	}
+
+	if (shm_alloc_space(inode, BIGPAGE_PAGES))
+		return ERR_PTR(-ENOSPC);
+
+	page = alloc_bigpage();
+	if (!page) {
+		shm_free_space(inode, BIGPAGE_PAGES);
+		return ERR_PTR(-ENOSPC);
+	}
+
+	spin_lock(&info->lock);
+	if (info->bigpages[bigidx]) {
+		spin_unlock(&info->lock);
+		free_bigpage(page);
+		shm_free_space(inode, BIGPAGE_PAGES);
+		goto got_bigpage;
+	}
+	info->bigpages[bigidx] = page;
+	spin_unlock(&info->lock);
+
+	goto got_bigpage;
+}
+
 static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -658,11 +743,55 @@ failed:
 	return error;
 }
 
+static int shmem_getbigpage(struct inode * inode, unsigned long idx, struct page **ptr)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	int error;
+
+	down(&info->sem);
+	*ptr = ERR_PTR(-EFAULT);
+	if (inode->i_size <= (loff_t) idx * PAGE_SIZE)
+		goto failed;
+
+	*ptr = shmem_getbigpage_locked(info, inode, idx);
+	if (IS_ERR (*ptr))
+		goto failed;
+
+	UnlockPage(*ptr);
+	up(&info->sem);
+	return 0;
+failed:
+	up (&info->sem);
+	error = PTR_ERR(*ptr);
+	*ptr = NOPAGE_SIGBUS;
+	if (error == -ENOMEM)
+		*ptr = NOPAGE_OOM;
+	return error;
+}
+
 struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused)
 {
-	struct page * page;
-	unsigned int idx;
+	struct page * page = NULL;
 	struct inode * inode = vma->vm_file->f_dentry->d_inode;
+	int bigpage = vma->vm_flags & VM_BIGPAGE;
+	unsigned long idx, bigidx;
+
+	if (I_BIGPAGE(inode)) {
+		idx = (address - vma->vm_start) >> PAGE_SHIFT;
+		idx += vma->vm_pgoff;
+
+		if (shmem_getbigpage(inode, idx, &page))
+			return page;
+
+		if (bigpage) {
+			put_page(page);
+			bigidx = idx / BIGPAGE_PAGES;
+			BUG_ON(bigidx >= SHMEM_I(inode)->max_bigpages);
+			page = SHMEM_I(inode)->bigpages[bigidx];
+			get_page(page);
+		}
+		return page;
+	}
 
 	idx = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
 	idx += vma->vm_pgoff;
@@ -684,8 +813,49 @@ void shmem_lock(struct file * file, int
 	up(&info->sem);
 }
 
+int shmem_make_bigpage_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	unsigned long pages;
+	struct shmem_inode_info *info;
+	int bigpage;
+
+	/*
+	 * COW of 4MB/2MB pages is ... an interesting concept. Disallow it.
+	 */
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	info = SHMEM_I(inode);
+	/*
+	 * Make sure the bigpage area is properly aligned and
+	 * properly sized, both on the virtual and on the
+	 * physical side.
+	 */
+	bigpage = 0;
+	if (shm_use_bigpages)
+		bigpage = 1;
+	if (vma->vm_start & BIGPAGE_MASK)
+		bigpage = 0;
+	if (vma->vm_end & BIGPAGE_MASK)
+		bigpage = 0;
+	if (vma->vm_pgoff % BIGPAGE_PAGES)
+		bigpage = 0;
+
+	pages = (vma->vm_end - vma->vm_start) / PAGE_SIZE + vma->vm_pgoff;
+	pages >>= (BIGPAGE_SHIFT - PAGE_SHIFT);
+	if (pages >= info->max_bigpages)
+		return -ENOSPC;
+
+	vma->vm_flags |= VM_LOCKED;
+	if (bigpage)
+		vma->vm_flags |= VM_BIGPAGE;
+	return 0;
+}
+
 static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
 {
+	int error = 0;
 	struct vm_operations_struct * ops;
 	struct inode *inode = file->f_dentry->d_inode;
 
@@ -693,8 +863,31 @@ static int shmem_mmap(struct file * file
 	if (!inode->i_sb || !S_ISREG(inode->i_mode))
 		return -EACCES;
 	UPDATE_ATIME(inode);
+	if (SHMEM_I(inode)->bigpages) {
+		error = shmem_make_bigpage_mmap(file, vma);
+		if (error)
+			goto out;
+	}
 	vma->vm_ops = ops;
 	vma->vm_flags &= ~VM_IO;
+ out:
+	return error;
+}
+
+int shmem_munmap(struct vm_area_struct * vma, unsigned long addr, size_t size)
+{
+	int bigpage = vma->vm_flags & VM_BIGPAGE;
+
+	/*
+	 * Make sure the unmapped bigpage area is properly aligned and
+	 * properly sized:
+	 */
+	if (bigpage) {
+		if (addr & BIGPAGE_MASK)
+			return -EINVAL;
+		if (size & BIGPAGE_MASK)
+			return -EINVAL;
+	}
 	return 0;
 }
 
@@ -726,6 +919,7 @@ struct inode *shmem_get_inode(struct sup
 		info->inode = inode;
 		spin_lock_init (&info->lock);
 		sema_init (&info->sem, 1);
+		BUG_ON(info->bigpages);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
@@ -784,14 +978,23 @@ static ssize_t
 shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
 {
 	struct inode	*inode = file->f_dentry->d_inode;
-	struct shmem_inode_info *info;
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	unsigned int	page_size, page_shift;
 	loff_t		pos;
 	struct page	*page;
 	unsigned long	written;
 	long		status;
 	int		err;
 
+	if (I_BIGPAGE(inode)) {
+		page_size = PAGE_SIZE;
+		page_shift = PAGE_SHIFT;
+	} else {
+		page_size = PAGE_CACHE_SIZE;
+		page_shift = PAGE_CACHE_SHIFT;
+	}
+
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 
@@ -845,9 +1048,9 @@ shmem_file_write(struct file *file,const
 		 * Try to find the page in the cache. If it isn't there,
 		 * allocate a free page.
 		 */
-		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = PAGE_CACHE_SIZE - offset;
+		offset = (pos & (page_size -1)); /* Within page */
+		index = pos >> page_shift;
+		bytes = page_size - offset;
 		if (bytes > count) {
 			bytes = count;
 		}
@@ -863,10 +1066,12 @@ shmem_file_write(struct file *file,const
 			__get_user(dummy, buf+bytes-1);
 		}
 
-		info = SHMEM_I(inode);
-		down (&info->sem);
-		page = shmem_getpage_locked(info, inode, index);
-		up (&info->sem);
+		down(&info->sem);
+		if (I_BIGPAGE(inode))
+			page = shmem_getbigpage_locked(info, inode, index);
+		else
+			page = shmem_getpage_locked(info, inode, index);
+		up(&info->sem);
 		status = PTR_ERR(page);
 
 		if (IS_ERR(page))
@@ -896,7 +1101,10 @@ shmem_file_write(struct file *file,const
 unlock:
 		/* Mark it unlocked again and drop the page.. */
 		UnlockPage(page);
-		page_cache_release(page);
+		if (I_BIGPAGE(inode))
+			__free_page(page);
+		else
+			page_cache_release(page);
 
 		if (status < 0)
 			break;
@@ -917,28 +1125,42 @@ static void do_shmem_file_read(struct fi
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
+	unsigned int page_size, page_shift, page_mask;
 	unsigned long index, offset;
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
-	offset = *ppos & ~PAGE_CACHE_MASK;
+	if (I_BIGPAGE(inode)) {
+		page_size = PAGE_SIZE;
+		page_shift = PAGE_SHIFT;
+		page_mask = PAGE_MASK;
+	} else {
+		page_size = PAGE_CACHE_SIZE;
+		page_shift = PAGE_CACHE_SHIFT;
+		page_mask = PAGE_CACHE_MASK;
+	}
+	index = *ppos >> page_shift;
+	offset = *ppos & ~page_mask;
 
 	for (;;) {
 		struct page *page;
 		unsigned long end_index, nr, ret;
 
-		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+		end_index = inode->i_size >> page_shift;
 		if (index > end_index)
 			break;
-		nr = PAGE_CACHE_SIZE;
+		nr = page_size;
 		if (index == end_index) {
-			nr = inode->i_size & ~PAGE_CACHE_MASK;
+			nr = inode->i_size & ~page_mask;
 			if (nr <= offset)
 				break;
 		}
 
 		nr = nr - offset;
 
-		if ((desc->error = shmem_getpage(inode, index, &page)))
+		if (I_BIGPAGE(inode))
+			desc->error = shmem_getbigpage(inode, index, &page);
+		else
+			desc->error = shmem_getpage(inode, index, &page);
+		if (desc->error)
 			break;
 
 		if (mapping->i_mmap_shared != NULL)
@@ -956,15 +1178,18 @@ static void do_shmem_file_read(struct fi
 		 */
 		ret = file_read_actor(desc, page, offset, nr);
 		offset += ret;
-		index += offset >> PAGE_CACHE_SHIFT;
-		offset &= ~PAGE_CACHE_MASK;
-
-		page_cache_release(page);
+		index += offset >> page_shift;
+		offset &= ~page_mask;
+
+		if (I_BIGPAGE(inode))
+			__free_page(page);
+		else
+			page_cache_release(page);
 		if (ret != nr || !desc->count)
 			break;
 	}
 
-	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+	*ppos = ((loff_t) index << page_shift) + offset;
 	UPDATE_ATIME(inode);
 }
 
@@ -1033,6 +1258,8 @@ static int shmem_mknod(struct inode *dir
 		d_instantiate(dentry, inode);
 		dget(dentry); /* Extra count - pin the dentry in core */
 		error = 0;
+		if (shm_use_bigpages > 1)
+			shm_enable_bigpages(dentry->d_inode);
 	}
 	return error;
 }
@@ -1381,7 +1608,25 @@ static struct super_block *shmem_read_su
 	return sb;
 }
 
+void shm_enable_bigpages(struct inode *inode)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (info->bigpages || !nr_bigpages)
+		return;
+	info->max_bigpages = sbinfo->max_blocks >> (BIGPAGE_SHIFT - PAGE_CACHE_SHIFT);
+	if (info->max_bigpages > MAX_BIGPAGES)
+		info->max_bigpages = MAX_BIGPAGES;
+	if (!info->max_bigpages)
+		return;
+	info->bigpages = (struct page **) kmalloc(info->max_bigpages * sizeof(struct page *), GFP_KERNEL);
+	if (!info->bigpages)
+		return;
+	memset(info->bigpages, 0, info->max_bigpages * sizeof(struct page *));
+}
 
 static struct address_space_operations shmem_aops = {
 	writepage: shmem_writepage,
@@ -1389,6 +1634,7 @@ static struct address_space_operations s
 
 static struct file_operations shmem_file_operations = {
 	mmap:		shmem_mmap,
+	munmap:		shmem_munmap,
 #ifdef CONFIG_TMPFS
 	read:		shmem_file_read,
 	write:		shmem_file_write,
diff -urNp x-ref/mm/swapfile.c x/mm/swapfile.c
--- x-ref/mm/swapfile.c	2003-01-14 02:37:48.000000000 +0100
+++ x/mm/swapfile.c	2003-01-14 02:37:51.000000000 +0100
@@ -406,6 +406,8 @@ static inline void unuse_pmd(struct vm_a
 
 	if (pmd_none(*dir))
 		return;
+	if (pmd_bigpage(*dir))
+		return;
 	if (pmd_bad(*dir)) {
 		pmd_ERROR(*dir);
 		pmd_clear(dir);
diff -urNp x-ref/mm/vmscan.c x/mm/vmscan.c
--- x-ref/mm/vmscan.c	2003-01-14 02:37:36.000000000 +0100
+++ x/mm/vmscan.c	2003-01-14 02:37:51.000000000 +0100
@@ -257,7 +257,7 @@ static inline int swap_out_vma(struct mm
 	unsigned long end;
 
 	/* Don't swap out areas which are reserved */
-	if (vma->vm_flags & VM_RESERVED)
+	if (vma->vm_flags & (VM_RESERVED|VM_BIGPAGE))
 		return count;
 
 	pgdir = pgd_offset(mm, address);
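Usage notes (illustrative, not part of the patch): the bigpage pool is reserved at boot via the "bigpages=" parameter parsed by reserve_bigpages() with memparse(), e.g. "bigpages=64M". The feature is then switched on through the new sysctl, /proc/sys/kernel/shm-use-bigpages: any non-zero value makes newseg() back SysV segments with bigpages when the segment size is an exact multiple of BIGPAGE_SIZE, and a value greater than 1 also enables bigpages for newly created tmpfs files via shmem_mknod(). Free pool space is reported in /proc/meminfo as "BigFree:", computed as nr_bigpages << (PMD_SHIFT-10) kB, i.e. 4096 kB per bigpage on non-PAE kernels (PMD_SHIFT == 22) and 2048 kB with PAE (PMD_SHIFT == 21). The sketch below is a hypothetical user-space test of the SysV path under those assumptions; the 4MB constant and the attach-address hint are illustrative only and match non-PAE i386.

/*
 * bigpage-shm-demo.c -- hypothetical demo, not part of the patch.
 * Assumes a kernel carrying this patch, booted with "bigpages=64M",
 * after "echo 1 > /proc/sys/kernel/shm-use-bigpages".
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#define DEMO_BIGPAGE_SIZE (4UL << 20)	/* BIGPAGE_SIZE on non-PAE i386 */

int main(void)
{
	/* newseg() only calls shm_enable_bigpages() when the segment
	 * size is a whole multiple of BIGPAGE_SIZE (and small enough
	 * to stay below MAX_BIGPAGES). */
	size_t size = 4 * DEMO_BIGPAGE_SIZE;
	int id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);

	if (id < 0) {
		perror("shmget");
		return 1;
	}

	/* shmem_make_bigpage_mmap() only keeps the bigpage flag when
	 * vm_start/vm_end are BIGPAGE_SIZE-aligned, so pass an aligned
	 * address hint instead of letting the kernel pick one. */
	char *p = shmat(id, (void *) 0x50000000, 0);	/* 4MB-aligned hint */

	if (p == (char *) -1) {
		perror("shmat");
		return 1;
	}

	system("grep BigFree /proc/meminfo");	/* before the faults */
	memset(p, 0xab, size);			/* fault in the PSE mappings */
	system("grep BigFree /proc/meminfo");	/* should have dropped */

	shmdt(p);
	shmctl(id, IPC_RMID, NULL);
	return 0;
}

Note that shmem_make_bigpage_mmap() falls back silently to normal pages (while still forcing VM_LOCKED) when the attach address or length is not BIGPAGE_SIZE-aligned, so watching "BigFree:" shrink across the memset() is the positive confirmation that the PSE path was actually taken.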