# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.1101 -> 1.1110
# mm/oom_kill.c 1.9.1.2 -> 1.13
# include/asm-ppc64/pgtable.h 1.5 -> 1.7
# arch/ppc64/kernel/pmc.c 1.4 -> 1.5
# arch/i386/mm/fault.c 1.13 -> 1.14
# drivers/sgi/char/graphics.c 1.6 -> 1.7
# include/asm-s390x/page.h 1.7 -> 1.8
# include/linux/list.h 1.9 -> 1.10
# include/linux/mmzone.h 1.7.1.2 -> 1.19
# include/linux/pagemap.h 1.19 -> 1.21
# kernel/ksyms.c 1.64.1.3 -> 1.67
# include/linux/swap.h 1.31.1.5 -> 1.44
# include/linux/elevator.h 1.4.1.1 -> 1.7
# include/linux/mm.h 1.38.1.2 -> 1.55
# fs/proc/array.c 1.9 -> 1.10
# include/asm-i386/fixmap.h 1.4.1.1 -> 1.6
# include/asm-i386/pgalloc.h 1.9 -> 1.10
# mm/mmap.c 1.23.1.6 -> 1.28
# drivers/s390/ccwcache.c 1.3.1.1 -> 1.5
# mm/page_alloc.c 1.43.1.13 -> 1.64
# include/asm-i386/highmem.h 1.4.1.1 -> 1.6
# drivers/char/drm/drm_proc.h 1.5 -> 1.6
# include/linux/sched.h 1.29.1.7 -> 1.34
# kernel/fork.c 1.23.1.4 -> 1.27
# drivers/block/ll_rw_blk.c 1.34.1.9 -> 1.38
# kernel/sys.c 1.9.1.3 -> 1.12
# kernel/sysctl.c 1.16.1.3 -> 1.21
# arch/i386/kernel/vm86.c 1.5 -> 1.6
# Makefile 1.161.1.46 -> 1.182
# include/linux/swapctl.h 1.2 -> 1.4
# fs/dcache.c 1.16.1.6 -> 1.23
# fs/dquot.c 1.16.1.1 -> 1.19
# mm/vmscan.c 1.59.1.3 -> 1.97
# fs/proc/proc_misc.c 1.13.1.7 -> 1.20
# include/asm-s390x/pgalloc.h 1.6 -> 1.7
# arch/s390/mm/ioremap.c 1.3 -> 1.4
# include/asm-i386/page.h 1.10 -> 1.11
# include/asm-s390/page.h 1.6 -> 1.7
# init/main.c 1.26 -> 1.27
# arch/s390/mm/init.c 1.8 -> 1.9
# arch/ppc64/mm/init.c 1.4 -> 1.5
# arch/x86_64/mm/pageattr.c 1.2 -> 1.3
# arch/x86_64/mm/init.c 1.5 -> 1.6
# mm/swapfile.c 1.23.1.3 -> 1.28
# arch/i386/mm/init.c 1.13.1.1 -> 1.15
# include/linux/fs.h 1.60.1.14 -> 1.68
# arch/i386/mm/Makefile 1.2 -> 1.3
# arch/i386/mm/ioremap.c 1.4 -> 1.5
# mm/numa.c 1.4 -> 1.6
# mm/bootmem.c 1.6.1.3 -> 1.9
# arch/x86_64/mm/ioremap.c 1.3 -> 1.4
# arch/i386/config.in 1.35.1.6 -> 1.37
# arch/s390x/mm/init.c 1.9 -> 1.10
# arch/i386/mm/pageattr.c 1.2 -> 1.3
# mm/filemap.c 1.62.1.14 -> 1.85
# include/linux/brlock.h 1.3 -> 1.4
# fs/exec.c 1.20.1.4 -> 1.26
# mm/swap.c 1.16.1.1 -> 1.27
# mm/mprotect.c 1.4 -> 1.5
# include/asm-s390x/pgtable.h 1.8 -> 1.9
# mm/shmem.c 1.45.1.1 -> 1.48
# mm/swap_state.c 1.17.1.1 -> 1.21
# include/asm-x86_64/pgtable.h 1.4 -> 1.5
# mm/memory.c 1.50.1.3 -> 1.60
# include/linux/slab.h 1.8.1.2 -> 1.12
# arch/i386/kernel/setup.c 1.37.1.26 -> 1.43
# arch/s390x/mm/ioremap.c 1.2 -> 1.3
# drivers/char/mem.c 1.17 -> 1.18
# include/asm-ppc64/pgalloc.h 1.2 -> 1.4
# include/asm-i386/pgtable-2level.h 1.3 -> 1.4
# mm/mremap.c 1.5 -> 1.8
# mm/vmalloc.c 1.13.1.2 -> 1.15
# fs/buffer.c 1.61.1.21 -> 1.69
# include/asm-s390/pgalloc.h 1.6 -> 1.7
# include/asm-x86_64/page.h 1.3 -> 1.4
# include/asm-ppc64/page.h 1.4 -> 1.5
# mm/Makefile 1.5 -> 1.6
# arch/arm/mm/mm-armv.c 1.5 -> 1.6
# arch/ppc64/kernel/htab.c 1.5 -> 1.6
# include/asm-i386/kmap_types.h 1.6 -> 1.7
# drivers/ieee1394/ieee1394_types.h 1.10 -> 1.11
# include/asm-i386/pgtable.h 1.8 -> 1.9
# include/linux/highmem.h 1.10.1.1 -> 1.12
# drivers/block/elevator.c 1.5.1.3 -> 1.8
# mm/slab.c 1.14.1.8 -> 1.21
# include/linux/module.h 1.12 -> 1.13
# include/asm-s390/pgtable.h 1.8 -> 1.9
# include/asm-i386/pgtable-3level.h 1.4 -> 1.5
# include/asm-x86_64/io.h 1.3 -> 1.4
# include/asm-x86_64/pgalloc.h 1.2 -> 1.3
# fs/inode.c 1.32.1.4 -> 1.38
#
arch/x86_64/mm/fault.c 1.7 -> 1.8 # (new) -> 1.1 include/asm-alpha/rmap.h # (new) -> 1.14 mm/rmap.c # (new) -> 1.1 include/asm-sparc64/rmap.h # (new) -> 1.2 include/asm-i386/rmap.h # (new) -> 1.1 arch/i386/mm/pgtable.c # (new) -> 1.2 include/asm-generic/rmap.h # (new) -> 1.1 include/asm-s390x/rmap.h # (new) -> 1.1 include/asm-ppc/rmap.h # (new) -> 1.1 include/asm-arm/rmap.h # (new) -> 1.1 include/asm-mips/rmap.h # (new) -> 1.9 include/linux/mm_inline.h # (new) -> 1.1 include/asm-s390/rmap.h # (new) -> 1.1 include/asm-sh/rmap.h # (new) -> 1.1 include/asm-m68k/rmap.h # (new) -> 1.1 include/asm-ia64/rmap.h # (new) -> 1.1 include/asm-x86_64/rmap.h # (new) -> 1.22 Changelog.rmap # (new) -> 1.1 mm/TODO # (new) -> 1.1 include/asm-sparc/rmap.h # (new) -> 1.1 include/asm-mips64/rmap.h # (new) -> 1.1 include/asm-arm/proc-armv/rmap.h # (new) -> 1.1 include/asm-ppc64/rmap.h # (new) -> 1.1 include/asm-cris/rmap.h # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/04/03 riel@chimarrao.boston.redhat.com 1.757.33.29 # add barrier() to page_chain_lock() (Pete Zaitcev) # -------------------------------------------- # 03/04/11 riel@cluless.boston.redhat.com 1.757.33.30 # pte-highmem defines for ppc64 (Julie DeWandel) # -------------------------------------------- # 03/04/11 riel@chimarrao.boston.redhat.com 1.757.33.31 # pte-highmem updates for s390 and s390x (Pete Zaitcev) # -------------------------------------------- # 03/04/11 riel@cluless.boston.redhat.com 1.757.33.32 # rmap 15f release # -------------------------------------------- # 03/04/12 riel@imladris.surriel.com 1.1102 # merge up to 2.4.21-pre7 # -------------------------------------------- # 03/04/12 riel@imladris.surriel.com 1.1103 # fix up /dev/mem pte-highmem awareness # -------------------------------------------- # 03/04/15 riel@chimarrao.boston.redhat.com 1.757.33.33 # ooops, forgot to add asm-ppc64/rmap.h ... # -------------------------------------------- # 03/04/15 riel@cluless.boston.redhat.com 1.1104 # Merge linuxvm@linuxvm.bkbits.net:linux-2.4-rmap # into cluless.boston.redhat.com:/home/boston/riel/bk/linux-2.4-rmap # -------------------------------------------- # 03/04/16 riel@chimarrao.boston.redhat.com 1.1105 # additional ppc64 pte-highmem fix # -------------------------------------------- # 03/04/16 riel@chimarrao.boston.redhat.com 1.1106 # yet another piece of the ppc64 pte-highmem puzzle ;) # -------------------------------------------- # 03/04/16 riel@chimarrao.boston.redhat.com 1.1107 # x86_64 rmap bits (Jim Paradis) # -------------------------------------------- # 03/04/17 riel@cluless.boston.redhat.com 1.1108 # awwww shucks, forgot to check in asm-x86_64/rmap.h ... # -------------------------------------------- # 03/04/17 riel@cluless.boston.redhat.com 1.1109 # reclaim buffer heads under memory pressure # -------------------------------------------- # 03/04/17 riel@cluless.boston.redhat.com 1.1110 # rmap 15g release # -------------------------------------------- # diff -Nru a/Changelog.rmap b/Changelog.rmap --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/Changelog.rmap Thu Apr 17 15:25:14 2003 @@ -0,0 +1,245 @@ +The seventh maintenance release of the 15th version of the reverse +mapping based VM is now available. +This is an attempt at making a more robust and flexible VM +subsystem, while cleaning up a lot of code at the same time. 
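
A note on the calling convention the changelog entries below keep referring to: under this patch, code that installs a user pte preallocates a pte_chain, maps the (possibly highmem) pte page, sets the pte and only then registers the reverse mapping, exactly as put_dirty_page() does in the fs/exec.c hunk later in this patch. The fragment below is only a condensed illustration of that pattern, not code from the patch; the function name install_one_pte() and its arguments are invented for the example, while pte_chain_alloc(), pte_alloc_map(), page_add_rmap(), pte_unmap() and pte_chain_free() are the interfaces the patch itself uses.

/*
 * Condensed illustration (not part of the patch) of the rmap pte-install
 * pattern used by put_dirty_page() in the fs/exec.c hunk below.
 * install_one_pte() and its arguments are invented for this example;
 * the pte_chain/rmap declarations come from the headers this patch touches.
 */
#include <linux/sched.h>
#include <linux/mm.h>

static int install_one_pte(struct mm_struct *mm, unsigned long address,
			   struct page *page, pgprot_t prot)
{
	struct pte_chain *pte_chain;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;

	/* Preallocate the reverse-mapping entry: a GFP_KERNEL allocation
	 * can sleep, so it must happen before taking page_table_lock. */
	pte_chain = pte_chain_alloc(GFP_KERNEL);
	if (!pte_chain)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	pgd = pgd_offset(mm, address);
	pmd = pmd_alloc(mm, pgd, address);
	pte = pmd ? pte_alloc_map(mm, pmd, address) : NULL; /* kmaps a highmem pte page */
	if (!pte) {
		spin_unlock(&mm->page_table_lock);
		pte_chain_free(pte_chain);
		return -ENOMEM;
	}
	if (!pte_none(*pte)) {		/* a pte is already installed here */
		pte_unmap(pte);
		spin_unlock(&mm->page_table_lock);
		pte_chain_free(pte_chain);
		return 0;
	}
	set_pte(pte, mk_pte(page, prot));
	/* Link the physical page back to this pte: the reverse mapping. */
	pte_chain = page_add_rmap(page, pte, pte_chain);
	mm->rss++;
	pte_unmap(pte);			/* drop the atomic kmap (CONFIG_HIGHPTE) */
	spin_unlock(&mm->page_table_lock);

	pte_chain_free(pte_chain);	/* free an unused preallocation, if any */
	return 0;
}

As in the fs/exec.c hunk, page_add_rmap() either consumes the preallocated pte_chain or hands it back, and the trailing pte_chain_free() releases whatever is left over.
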
+The patch is available from: + + http://surriel.com/patches/2.4/2.4.20-rmap15g +and http://linuxvm.bkbits.net/ + + +My big TODO items for a next release are: + - finetune the O(1) VM code for strange corner cases + - add pte-highmem defines for more architectures + - highmem tweaks + + +rmap 15g: + - more ppc64 pte-highmem stuff (Julie DeWandel) + - hammer pte-highmem stuff (Jim Paradis) + - reclaim buffer heads under memory pressure (me) +rmap 15f: + - remove pte-highmem compat define from ieee1394 (Marc-C. Petersen) + - clean up scan_active_list after suggestion from hch (me) + - lock ordering fix (me) + - add barrier() to page_chain_lock() (Pete Zaitcev) + - fix pte-highmem defines for ppc64 (Julie DeWandel) + - add pte-highmem defines for s390 & s390x (Pete Zaitcev) +rmap 15e: + - make reclaiming unused inodes more efficient (Arjan van de Ven) + | push to Marcelo and Andrew once it's well tested ! + - fix DRM memory leak (Arjan van de Ven) + - fix potential infinite loop in kswapd (me) + - clean up elevator.h (no IO scheduler in -rmap...) (me) + - page aging interval tuned on a per zone basis, better + wakeup mechanism for sudden memory pressure (Arjan, me) +rmap 15d: + - compatability with PREEMPT patch (me) + | fairly ugly, but should work + - bugfix for the pte_chain allocation code (Arjan van de Ven) +rmap 15c: + - backport and audit akpm's reliable pte_chain alloc + code from 2.5 (me) + - reintroduce cache size tuning knobs in /proc (me) + | on very, very popular request +rmap 15b: + - adjust anon/cache work table (me) + - make active_age_bias a per-active list thing (me) + - don't wake up kswapd early from mark_page_accessed (me) + - make sure pte-chains are cacheline aligned with PAE (me, Andrew Morton) + - change some O(1) VM thresholds (me) + - fix pte-highmem backport (me) + - 2.5 backport: pte-highmem (Ben LaHaise) + - 2.5 backport: large cacheline aligned pte-chains (Ben LaHaise) + - 2.5 backport: direct pte pointers (Ben LaHaise) + - undo __find_pagecache_page braindamage (Christoph Hellwig) +rmap 15a: + - more agressive freeing for higher order allocations (me) + - export __find_pagecache_page, find_get_page define (me, Christoph, Arjan) + - make memory statistics SMP safe again (me) + - make page aging slow down again when needed (Andrew Morton) + - first stab at fine-tuning arjan's O(1) VM (me) + - split active list in cache / working set (me) + - fix SMP locking in arjan's O(1) VM (me) +rmap 15: + - small code cleanups and spelling fixes for O(1) VM (me) + - O(1) page launder, O(1) page aging (Arjan van de Ven) + - resync code with -ac (12 small patches) (me) +rmap 14c: + - fold page_over_rsslimit() into page_referenced() (me) + - 2.5 backport: get pte_chains from the slab cache (William Lee Irwin) + - remove dead code from page_launder_zone() (me) + - make OOM detection a bit more agressive (me) +rmap 14b: + - don't unmap pages not in pagecache (ext3 & reiser) (Andrew Morton, me) + - clean up mark_page_accessed a bit (me) + - Alpha NUMA fix for Ingo's per-cpu pages (Flávio Leitner, me) + - remove explicit low latency schedule zap_page_range (Robert Love) + - fix OOM stuff for good, hopefully (me) +rmap 14a: + - Ingo Molnar's per-cpu pages (SMP speedup) (Christoph Hellwig) + - fix SMP bug in page_launder_zone (rmap14 only) (Arjan van de Ven) + - semicolon day, fix typo in rmap.c w/ DEBUG_RMAP (Craig Kulesa) + - remove unneeded pte_chain_unlock/lock pair vmscan.c (Craig Kulesa) + - low latency zap_page_range also without preempt (Arjan van de Ven) + - do some 
throughput tuning for kswapd/page_launder (me) + - don't allocate swap space for pages we're not writing (me) +rmap 14: + - get rid of stalls during swapping, hopefully (me) + - low latency zap_page_range (Robert Love) +rmap 13c: + - add wmb() to wakeup_memwaiters (Arjan van de Ven) + - remap_pmd_range now calls pte_alloc with full address (Paul Mackerras) + - #ifdef out pte_chain_lock/unlock on UP machines (Andrew Morton) + - un-BUG() truncate_complete_page, the race is expected (Andrew Morton, me) + - remove NUMA changes from rmap13a (Christoph Hellwig) +rmap 13b: + - prevent PF_MEMALLOC recursion for higher order allocs (Arjan van de Ven, me) + - fix small SMP race, PG_lru (Hugh Dickins) +rmap 13a: + - NUMA changes for page_address (Samuel Ortiz) + - replace vm.freepages with simpler kswapd_minfree (Christoph Hellwig) +rmap 13: + - rename touch_page to mark_page_accessed and uninline (Christoph Hellwig) + - NUMA bugfix for __alloc_pages (William Irwin) + - kill __find_page (Christoph Hellwig) + - make pte_chain_freelist per zone (William Irwin) + - protect pte_chains by per-page lock bit (William Irwin) + - minor code cleanups (me) +rmap 12i: + - slab cleanup (Christoph Hellwig) + - remove references to compiler.h from mm/* (me) + - move rmap to marcelo's bk tree (me) + - minor cleanups (me) +rmap 12h: + - hopefully fix OOM detection algorithm (me) + - drop pte quicklist in anticipation of pte-highmem (me) + - replace andrea's highmem emulation by ingo's one (me) + - improve rss limit checking (Nick Piggin) +rmap 12g: + - port to armv architecture (David Woodhouse) + - NUMA fix to zone_table initialisation (Samuel Ortiz) + - remove init_page_count (David Miller) +rmap 12f: + - for_each_pgdat macro (William Lee Irwin) + - put back EXPORT(__find_get_page) for modular rd (me) + - make bdflush and kswapd actually start queued disk IO (me) +rmap 12e + - RSS limit fix, the limit can be 0 for some reason (me) + - clean up for_each_zone define to not need pgdata_t (William Lee Irwin) + - fix i810_dma bug introduced with page->wait removal (William Lee Irwin) +rmap 12d: + - fix compiler warning in rmap.c (Roger Larsson) + - read latency improvement (read-latency2) (Andrew Morton) +rmap 12c: + - fix small balancing bug in page_launder_zone (Nick Piggin) + - wakeup_kswapd / wakeup_memwaiters code fix (Arjan van de Ven) + - improve RSS limit enforcement (me) +rmap 12b: + - highmem emulation (for debugging purposes) (Andrea Arcangeli) + - ulimit RSS enforcement when memory gets tight (me) + - sparc64 page->virtual quickfix (Greg Procunier) +rmap 12a: + - fix the compile warning in buffer.c (me) + - fix divide-by-zero on highmem initialisation DOH! (me) + - remove the pgd quicklist (suspicious ...) 
(DaveM, me) +rmap 12: + - keep some extra free memory on large machines (Arjan van de Ven, me) + - higher-order allocation bugfix (Adrian Drzewiecki) + - nr_free_buffer_pages() returns inactive + free mem (me) + - pages from unused objects directly to inactive_clean (me) + - use fast pte quicklists on non-pae machines (Andrea Arcangeli) + - remove sleep_on from wakeup_kswapd (Arjan van de Ven) + - page waitqueue cleanup (Christoph Hellwig) +rmap 11c: + - oom_kill race locking fix (Andres Salomon) + - elevator improvement (Andrew Morton) + - dirty buffer writeout speedup (hopefully ;)) (me) + - small documentation updates (me) + - page_launder() never does synchronous IO, kswapd + and the processes calling it sleep on higher level (me) + - deadlock fix in touch_page() (me) +rmap 11b: + - added low latency reschedule points in vmscan.c (me) + - make i810_dma.c include mm_inline.h too (William Lee Irwin) + - wake up kswapd sleeper tasks on OOM kill so the + killed task can continue on its way out (me) + - tune page allocation sleep point a little (me) +rmap 11a: + - don't let refill_inactive() progress count for OOM (me) + - after an OOM kill, wait 5 seconds for the next kill (me) + - agpgart_be fix for hashed waitqueues (William Lee Irwin) +rmap 11: + - fix stupid logic inversion bug in wakeup_kswapd() (Andrew Morton) + - fix it again in the morning (me) + - add #ifdef BROKEN_PPC_PTE_ALLOC_ONE to rmap.h, it + seems PPC calls pte_alloc() before mem_map[] init (me) + - disable the debugging code in rmap.c ... the code + is working and people are running benchmarks (me) + - let the slab cache shrink functions return a value + to help prevent early OOM killing (Ed Tomlinson) + - also, don't call the OOM code if we have enough + free pages (me) + - move the call to lru_cache_del into __free_pages_ok (Ben LaHaise) + - replace the per-page waitqueue with a hashed + waitqueue, reduces size of struct page from 64 + bytes to 52 bytes (48 bytes on non-highmem machines) (William Lee Irwin) +rmap 10: + - fix the livelock for real (yeah right), turned out + to be a stupid bug in page_launder_zone() (me) + - to make sure the VM subsystem doesn't monopolise + the CPU, let kswapd and some apps sleep a bit under + heavy stress situations (me) + - let __GFP_HIGH allocations dig a little bit deeper + into the free page pool, the SCSI layer seems fragile (me) +rmap 9: + - improve comments all over the place (Michael Cohen) + - don't panic if page_remove_rmap() cannot find the + rmap in question, it's possible that the memory was + PG_reserved and belonging to a driver, but the driver + exited and cleared the PG_reserved bit (me) + - fix the VM livelock by replacing > by >= in a few + critical places in the pageout code (me) + - treat the reclaiming of an inactive_clean page like + allocating a new page, calling try_to_free_pages() + and/or fixup_freespace() if required (me) + - when low on memory, don't make things worse by + doing swapin_readahead (me) +rmap 8: + - add ANY_ZONE to the balancing functions to improve + kswapd's balancing a bit (me) + - regularize some of the maximum loop bounds in + vmscan.c for cosmetic purposes (William Lee Irwin) + - move page_address() to architecture-independent + code, now the removal of page->virtual is portable (William Lee Irwin) + - speed up free_area_init_core() by doing a single + pass over the pages and not using atomic ops (William Lee Irwin) + - documented the buddy allocator in page_alloc.c (William Lee Irwin) +rmap 7: + - clean up and document vmscan.c (me) + - 
reduce size of page struct, part one (William Lee Irwin) + - add rmap.h for other archs (untested, not for ARM) (me) +rmap 6: + - make the active and inactive_dirty list per zone, + this is finally possible because we can free pages + based on their physical address (William Lee Irwin) + - cleaned up William's code a bit (me) + - turn some defines into inlines and move those to + mm_inline.h (the includes are a mess ...) (me) + - improve the VM balancing a bit (me) + - add back inactive_target to /proc/meminfo (me) +rmap 5: + - fixed recursive buglet, introduced by directly + editing the patch for making rmap 4 ;))) (me) +rmap 4: + - look at the referenced bits in page tables (me) +rmap 3: + - forgot one FASTCALL definition (me) +rmap 2: + - teach try_to_unmap_one() about mremap() (me) + - don't assign swap space to pages with buffers (me) + - make the rmap.c functions FASTCALL / inline (me) +rmap 1: + - fix the swap leak in rmap 0 (Dave McCracken) +rmap 0: + - port of reverse mapping VM to 2.4.16 (me) diff -Nru a/Makefile b/Makefile --- a/Makefile Thu Apr 17 15:25:14 2003 +++ b/Makefile Thu Apr 17 15:25:14 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 21 -EXTRAVERSION = -pre7 +EXTRAVERSION = -pre7-rmap15g KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nru a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c --- a/arch/arm/mm/mm-armv.c Thu Apr 17 15:25:14 2003 +++ b/arch/arm/mm/mm-armv.c Thu Apr 17 15:25:14 2003 @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -470,6 +471,7 @@ * cache implementation. */ kmem_cache_t *pte_cache; +kmem_cache_t *pte_rmap_cache; /* * The constructor gets called for each object within the cache when the @@ -480,6 +482,22 @@ { unsigned long block = (unsigned long)pte; + if (!(block & 2048)) { + /* First object of two in a page - allocate the + pte_rmap_info to go with them */ + + struct page * page = virt_to_page(pte); + + if (flags & SLAB_CTOR_ATOMIC) + BUG(); + + page->mapping = kmem_cache_alloc(pte_rmap_cache, GFP_KERNEL); + if (!page->mapping) { + printk(KERN_CRIT "pte_rmap_cache alloc failed. Oops. 
Slab constructors need to be allowed to fail\n"); + /* return -ENOMEM; */ + BUG(); + } + } if (block & 2047) BUG(); @@ -488,11 +506,32 @@ PTRS_PER_PTE * sizeof(pte_t), 0); } +static void pte_cache_dtor(void *pte, kmem_cache_t *cache, unsigned long flags) +{ + unsigned long block = (unsigned long)pte; + + if (!(block & 2048)) { + /* First object of two in a page - free the + pte_rmap_info that was associated with them */ + + struct page * page = virt_to_page(pte); + + kmem_cache_free(pte_rmap_cache, page->mapping); + page->mapping = NULL; + } +} + void __init pgtable_cache_init(void) { + pte_rmap_cache = kmem_cache_create("pte-rmap-cache", + 2 * sizeof(struct arm_rmap_info), 0, 0, + NULL, NULL); + if (!pte_rmap_cache) + BUG(); + pte_cache = kmem_cache_create("pte-cache", 2 * PTRS_PER_PTE * sizeof(pte_t), 0, 0, - pte_cache_ctor, NULL); + pte_cache_ctor, pte_cache_dtor); if (!pte_cache) BUG(); } diff -Nru a/arch/i386/config.in b/arch/i386/config.in --- a/arch/i386/config.in Thu Apr 17 15:25:14 2003 +++ b/arch/i386/config.in Thu Apr 17 15:25:14 2003 @@ -207,6 +207,7 @@ 64GB CONFIG_HIGHMEM64G" off if [ "$CONFIG_HIGHMEM4G" = "y" -o "$CONFIG_HIGHMEM64G" = "y" ]; then define_bool CONFIG_HIGHMEM y + define_bool CONFIG_HIGHPTE y else define_bool CONFIG_HIGHMEM n fi diff -Nru a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c --- a/arch/i386/kernel/vm86.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/kernel/vm86.c Thu Apr 17 15:25:14 2003 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -121,7 +122,7 @@ { pgd_t *pgd; pmd_t *pmd; - pte_t *pte; + pte_t *pte, *mapped; int i; spin_lock(&tsk->mm->page_table_lock); @@ -141,12 +142,13 @@ pmd_clear(pmd); goto out; } - pte = pte_offset(pmd, 0xA0000); + mapped = pte = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); + ptep_set_wrprotect(pte); pte++; } + pte_unmap(mapped); out: spin_unlock(&tsk->mm->page_table_lock); flush_tlb(); diff -Nru a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile --- a/arch/i386/mm/Makefile Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/Makefile Thu Apr 17 15:25:14 2003 @@ -9,7 +9,7 @@ O_TARGET := mm.o -obj-y := init.o fault.o ioremap.o extable.o pageattr.o +obj-y := init.o fault.o ioremap.o extable.o pageattr.o pgtable.o export-objs := pageattr.o include $(TOPDIR)/Rules.make diff -Nru a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c --- a/arch/i386/mm/fault.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/fault.c Thu Apr 17 15:25:14 2003 @@ -392,7 +392,7 @@ goto no_context; set_pmd(pmd, *pmd_k); - pte_k = pte_offset(pmd_k, address); + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; diff -Nru a/arch/i386/mm/init.c b/arch/i386/mm/init.c --- a/arch/i386/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/init.c Thu Apr 17 15:25:14 2003 @@ -45,6 +45,8 @@ int do_check_pgt_cache(int low, int high) { + return 0; /* FIXME! 
*/ +#if 0 int freed = 0; if(pgtable_cache_size > high) { do { @@ -63,6 +65,7 @@ } while(pgtable_cache_size > low); } return freed; +#endif } /* @@ -76,7 +79,7 @@ pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ - pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -90,36 +93,6 @@ } #endif /* CONFIG_HIGHMEM */ -void show_mem(void) -{ - int i, total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - - printk("Mem-info:\n"); - show_free_areas(); - printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); - i = max_mapnr; - while (i-- > 0) { - total++; - if (PageHighMem(mem_map+i)) - highmem++; - if (PageReserved(mem_map+i)) - reserved++; - else if (PageSwapCache(mem_map+i)) - cached++; - else if (page_count(mem_map+i)) - shared += page_count(mem_map+i) - 1; - } - printk("%d pages of RAM\n", total); - printk("%d pages of HIGHMEM\n",highmem); - printk("%d reserved pages\n",reserved); - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); - printk("%ld pages in page table cache\n",pgtable_cache_size); - show_buffers(); -} - /* References to section boundaries */ extern char _text, _etext, _edata, __bss_start, _end; @@ -142,7 +115,7 @@ printk("PAE BUG #01!\n"); return; } - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); /* stored as-is, to permit clearing entries */ set_pte(pte, mk_pte_phys(phys, flags)); @@ -153,17 +126,6 @@ __flush_tlb_one(vaddr); } -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, phys, flags); -} - static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; @@ -193,7 +155,7 @@ if (pmd_none(*pmd)) { pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte))); - if (pte != pte_offset(pmd, 0)) + if (pte != pte_offset_kernel(pmd, 0)) BUG(); } vaddr += PMD_SIZE; @@ -264,7 +226,7 @@ *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL); } set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); - if (pte_base != pte_offset(pmd, 0)) + if (pte_base != pte_offset_kernel(pmd, 0)) BUG(); } @@ -286,7 +248,7 @@ pgd = swapper_pg_dir + __pgd_offset(vaddr); pmd = pmd_offset(pgd, vaddr); - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; #endif @@ -397,7 +359,7 @@ pgd = swapper_pg_dir + __pgd_offset(vaddr); pmd = pmd_offset(pgd, vaddr); - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); old_pte = *pte; *pte = mk_pte_phys(0, PAGE_READONLY); local_flush_tlb(); diff -Nru a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c --- a/arch/i386/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -49,7 +49,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c --- a/arch/i386/mm/pageattr.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/pageattr.c Thu Apr 17 15:25:14 2003 @@ -29,7 +29,7 @@ return NULL; if (pmd_val(*pmd) & _PAGE_PSE) return (pte_t *)pmd; - return pte_offset(pmd, address); + return 
pte_offset_kernel(pmd, address); } static struct page *split_large_page(unsigned long address, pgprot_t prot) diff -Nru a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/i386/mm/pgtable.c Thu Apr 17 15:25:14 2003 @@ -0,0 +1,226 @@ +/* + * linux/arch/i386/mm/pgtable.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +//#include + +void show_mem(void) +{ + int total = 0, reserved = 0; + int shared = 0, cached = 0; + int highmem = 0; + struct page *page; + pg_data_t *pgdat; + unsigned long i; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); + for_each_pgdat(pgdat) { + for (i = 0; i < pgdat->node_size; ++i) { + page = pgdat->node_mem_map + i; + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + } + printk("%d pages of RAM\n", total); + printk("%d pages of HIGHMEM\n",highmem); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + __pgd_offset(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pmd = pmd_offset(pgd, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte(pfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. + */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk ("set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk ("set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + __pgd_offset(vaddr); + if (pgd_none(*pgd)) { + printk ("set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pmd = pmd_offset(pgd, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. 
+ * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + int count = 0; + pte_t *pte; + + do { + pte = (pte_t *) __get_free_page(GFP_KERNEL); + if (pte) + clear_page(pte); + else { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ); + } + } while (!pte && (count++ < 10)); + return pte; +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + int count = 0; + struct page *pte; + + do { +#if CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0); +#else + pte = alloc_pages(GFP_KERNEL, 0); +#endif + if (pte) + clear_highpage(pte); + else { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ); + } + } while (!pte && (count++ < 10)); + return pte; +} + +#if CONFIG_X86_PAE + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + + if (pgd) { + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + unsigned long pmd = __get_free_page(GFP_KERNEL); + if (!pmd) + goto out_oom; + clear_page(pmd); + set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + } + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + return pgd; +out_oom: + for (i--; i >= 0; i--) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); + return NULL; +} + +void pgd_free(pgd_t *pgd) +{ + int i; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); +} + +#else + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + + if (pgd) { + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + return pgd; +} + +void pgd_free(pgd_t *pgd) +{ + free_page((unsigned long)pgd); +} + +#endif /* CONFIG_X86_PAE */ + diff -Nru a/arch/ppc64/kernel/htab.c b/arch/ppc64/kernel/htab.c --- a/arch/ppc64/kernel/htab.c Thu Apr 17 15:25:14 2003 +++ b/arch/ppc64/kernel/htab.c Thu Apr 17 15:25:14 2003 @@ -288,7 +288,7 @@ if (!pgd_none(*pg)) { pm = pmd_offset(pg, ea); if (!pmd_none(*pm)) { - pt = pte_offset(pm, ea); + pt = pte_offset_kernel(pm, ea); pte = *pt; if (!pte_present(pte)) pt = NULL; diff -Nru a/arch/ppc64/kernel/pmc.c b/arch/ppc64/kernel/pmc.c --- a/arch/ppc64/kernel/pmc.c Thu Apr 17 15:25:14 2003 +++ b/arch/ppc64/kernel/pmc.c Thu Apr 17 15:25:14 2003 @@ -248,7 +248,7 @@ */ pgdp = pgd_offset_b(ea); pmdp = pmd_alloc(&btmalloc_mm, pgdp, ea); - ptep = pte_alloc(&btmalloc_mm, pmdp, ea); + ptep = pte_alloc_kernel(&btmalloc_mm, pmdp, ea); pte = *ptep; /* Clear any old hpte and set the new linux pte */ diff -Nru a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c --- a/arch/ppc64/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/ppc64/mm/init.c Thu Apr 17 15:25:14 2003 @@ -107,6 +107,7 @@ { int freed = 0; +#if 0 if (pgtable_cache_size > high) { do { if (pgd_quicklist) @@ -117,6 +118,7 @@ free_page((unsigned long)pte_alloc_one_fast(0, 0)), ++freed; } while (pgtable_cache_size > low); } +#endif return freed; } @@ -246,7 +248,7 @@ 
spin_lock(&ioremap_mm.page_table_lock); pgdp = pgd_offset_i(ea); pmdp = pmd_alloc(&ioremap_mm, pgdp, ea); - ptep = pte_alloc(&ioremap_mm, pmdp, ea); + ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); pa = absolute_to_phys(pa); set_pte(ptep, mk_pte_phys(pa & PAGE_MASK, __pgprot(flags))); @@ -331,7 +333,7 @@ if (!pgd_none(*pgd)) { pmd = pmd_offset(pgd, vmaddr); if (!pmd_none(*pmd)) { - ptep = pte_offset(pmd, vmaddr); + ptep = pte_offset_map(pmd, vmaddr); /* Check if HPTE might exist and flush it if so */ if (pte_val(*ptep) & _PAGE_HASHPTE) flush_hash_page(context, vmaddr, ptep); @@ -383,7 +385,7 @@ if ( pmd_end > end ) pmd_end = end; if ( !pmd_none( *pmd ) ) { - ptep = pte_offset( pmd, start ); + ptep = pte_offset_map( pmd, start ); do { if ( pte_val(*ptep) & _PAGE_HASHPTE ) flush_hash_page( context, start, ptep ); diff -Nru a/arch/s390/mm/init.c b/arch/s390/mm/init.c --- a/arch/s390/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390/mm/init.c Thu Apr 17 15:25:14 2003 @@ -47,6 +47,9 @@ int do_check_pgt_cache(int low, int high) { +#if 1 /* No quicklists in rmap */ + return 0; +#else int freed = 0; if(pgtable_cache_size > high) { do { @@ -65,6 +68,7 @@ } while(pgtable_cache_size > low); } return freed; +#endif } void show_mem(void) diff -Nru a/arch/s390/mm/ioremap.c b/arch/s390/mm/ioremap.c --- a/arch/s390/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -54,7 +54,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/s390x/mm/init.c b/arch/s390x/mm/init.c --- a/arch/s390x/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390x/mm/init.c Thu Apr 17 15:25:14 2003 @@ -45,96 +45,6 @@ pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE))); char empty_zero_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); -pmd_t *pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) -{ - unsigned long addr = (unsigned long) pgd; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - unsigned long offset = addr & 4; - pmd_t *new, *pmd2; - int i; - - if (offset == 0 && - ((*pgd_slot & _PGD_ENTRY_INV) != 0 || - (*pgd_slot & _PGD_ENTRY_LEN(2)) == 0)) { - /* Set lower pmd, upper pmd is empty. */ - *pgd_slot = __pa(pmd) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(0) | _PGD_ENTRY_LEN(1); - return pmd; - } - if (offset == 4 && - ((*pgd_slot & _PGD_ENTRY_INV) != 0 || - (*pgd_slot & _PGD_ENTRY_OFF(2)) != 0)) { - /* Lower pmd empty, set upper pmd. */ - *pgd_slot = (__pa(pmd) - 0x2000) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(2) | _PGD_ENTRY_LEN(3); - return pmd; - } - /* We have to enlarge the pmd to 16K if we arrive here. */ - new = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); - if (new == NULL) { - pmd_free(pmd); - return NULL; - } - /* Set the PG_arch_1 bit on the first and the third pmd page - so that pmd_free_fast can recognize pmds that have been - allocated with an order 2 allocation. */ - set_bit(PG_arch_1, &virt_to_page(new)->flags); - set_bit(PG_arch_1, &virt_to_page(new+PTRS_PER_PMD)->flags); - /* Now copy the two pmds to the new memory area. 
*/ - if (offset == 0) { - pmd2 = (pmd_t *)(*pgd_slot & PAGE_MASK) + PTRS_PER_PMD; - memcpy(new, pmd, sizeof(pmd_t)*PTRS_PER_PMD); - memcpy(new + PTRS_PER_PMD, pmd2, sizeof(pmd_t)*PTRS_PER_PMD); - } else { - pmd2 = (pmd_t *)(*pgd_slot & PAGE_MASK); - memcpy(new, pmd2, sizeof(pmd_t)*PTRS_PER_PMD); - memcpy(new + PTRS_PER_PMD, pmd, sizeof(pmd_t)*PTRS_PER_PMD); - } - *pgd_slot = __pa(new) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(0) | _PGD_ENTRY_LEN(3); - for (i = 0; i < PTRS_PER_PMD; i++) { - pmd_clear(pmd + i); - pmd_clear(pmd2 + i); - } - pmd_free(pmd); - pmd_free(pmd2); - return new; -} - -void pmd_free_order2(pmd_t *pmd) -{ - pmd_t *pmd2 = (pmd_t *) ((unsigned long) pmd ^ 8192); - - clear_bit(PG_arch_1, &virt_to_page(pmd)->flags); - if (test_bit(PG_arch_1, &virt_to_page(pmd2)->flags) == 0) { - /* The other pmd of the order 2 allocation has already - been freed. Now we can release the order 2 allocation. */ - free_pages((unsigned long) pmd & ~8192, 2); - } -} - -int do_check_pgt_cache(int low, int high) -{ - int freed = 0; - if(pgtable_cache_size > high) { - do { - if(pgd_quicklist) { - free_pgd_slow(get_pgd_fast()); - freed += 2; - } - if(pmd_quicklist) { - pmd_free_slow(pmd_alloc_one_fast(NULL, 0)); - freed += 2; - } - if(pte_quicklist) { - pte_free_slow(pte_alloc_one_fast(NULL, 0)); - freed += 1; - } - } while(pgtable_cache_size > low); - } - return freed; -} - void show_mem(void) { int i, total = 0,reserved = 0; @@ -181,9 +91,13 @@ void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned long pgdir_k = (__pa(swapper_pg_dir) & PAGE_MASK) | + _KERN_REGION_TABLE; static const int ssm_mask = 0x04000000L; unsigned long dma_pfn, address, end_mem; pgd_t * pg_dir; + pmd_t * pm_dir; + pte_t * pt_dir; int i,j,k; dma_pfn = MAX_DMA_ADDRESS >> PAGE_SHIFT; @@ -204,8 +118,7 @@ pg_dir = swapper_pg_dir; address = 0; end_mem = (unsigned long) __va(max_low_pfn*PAGE_SIZE); - for (i = 0 ; i < PTRS_PER_PGD/2 ; i++, pg_dir += 2) { - pmd_t *pm_dir; + for (i = 0 ; i < PTRS_PER_PGD ; i++,pg_dir++) { if (address >= end_mem) { pgd_clear(pg_dir); @@ -213,11 +126,9 @@ } pm_dir = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE*4); - *((unsigned long *) pg_dir) = __pa(pm_dir) | _PGD_ENTRY_MASK | - _PGD_ENTRY_LEN(3) | _PGD_ENTRY_OFF(0); + pgd_populate(&init_mm, pg_dir, pm_dir); - for (j = 0 ; j < PTRS_PER_PMD*2 ; j++, pm_dir++) { - pte_t *pt_dir; + for (j = 0 ; j < PTRS_PER_PMD ; j++,pm_dir++) { if (address >= end_mem) { pmd_clear(pm_dir); @@ -225,7 +136,7 @@ } pt_dir = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - pmd_populate(&init_mm, pm_dir, pt_dir); + pmd_populate_kernel(&init_mm, pm_dir, pt_dir); for (k = 0 ; k < PTRS_PER_PTE ; k++,pt_dir++) { pte_t pte = mk_pte_phys(address, PAGE_KERNEL); @@ -249,8 +160,8 @@ "lctlg 7,7,%0\n\t" "lctlg 13,13,%0\n\t" "ssm %1" - : :"m" (__pa(swapper_pg_dir) | _KERN_REGION_TABLE), - "m" (ssm_mask)); + : :"m" (pgdir_k), "m" (ssm_mask)); + local_flush_tlb(); return; diff -Nru a/arch/s390x/mm/ioremap.c b/arch/s390x/mm/ioremap.c --- a/arch/s390x/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390x/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -54,7 +54,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c --- a/arch/x86_64/mm/fault.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/fault.c Thu Apr 17 15:25:14 2003 
@@ -341,7 +341,7 @@ pmd = pmd_offset(pgd, address); if (!pmd_present(*pmd)) goto bad_area_nosemaphore; - pte = pte_offset(pmd, address); + pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) goto bad_area_nosemaphore; diff -Nru a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c --- a/arch/x86_64/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/init.c Thu Apr 17 15:25:14 2003 @@ -142,12 +142,12 @@ if (pmd_none(*pmd)) { pte = (pte_t *) spp_getpage(); set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); - if (pte != pte_offset(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) { printk("PAGETABLE BUG #02!\n"); return; } } - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, mk_pte_phys(phys, prot)); /* diff -Nru a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c --- a/arch/x86_64/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -49,7 +49,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c --- a/arch/x86_64/mm/pageattr.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/pageattr.c Thu Apr 17 15:25:14 2003 @@ -23,7 +23,7 @@ if ((pmd_val(*pmd) & PAGE_LARGE) == PAGE_LARGE) return (pte_t *)pmd; - return pte_offset(pmd, address); + return pte_offset_kernel(pmd, address); } static struct page *split_large_page(unsigned long address, pgprot_t prot) diff -Nru a/drivers/char/drm/drm_proc.h b/drivers/char/drm/drm_proc.h --- a/drivers/char/drm/drm_proc.h Thu Apr 17 15:25:14 2003 +++ b/drivers/char/drm/drm_proc.h Thu Apr 17 15:25:14 2003 @@ -448,7 +448,7 @@ for (i = vma->vm_start; i < vma->vm_end; i += PAGE_SIZE) { pgd = pgd_offset(vma->vm_mm, i); pmd = pmd_offset(pgd, i); - pte = pte_offset(pmd, i); + pte = pte_offset_map(pmd, i); if (pte_present(*pte)) { address = __pa(pte_page(*pte)) + (i & (PAGE_SIZE-1)); @@ -464,6 +464,7 @@ } else { DRM_PROC_PRINT(" 0x%08lx\n", i); } + pte_unmap(pte); } #endif } diff -Nru a/drivers/char/mem.c b/drivers/char/mem.c --- a/drivers/char/mem.c Thu Apr 17 15:25:14 2003 +++ b/drivers/char/mem.c Thu Apr 17 15:25:14 2003 @@ -542,7 +542,7 @@ pmd = pmd_offset(pgd, kaddr); if (pmd_none(*pmd) || pmd_bad(*pmd)) goto out; - ptep = pte_offset(pmd, kaddr); + ptep = pte_offset_kernel(pmd, kaddr); if (!ptep) goto out; pte = *ptep; diff -Nru a/drivers/sgi/char/graphics.c b/drivers/sgi/char/graphics.c --- a/drivers/sgi/char/graphics.c Thu Apr 17 15:25:14 2003 +++ b/drivers/sgi/char/graphics.c Thu Apr 17 15:25:14 2003 @@ -219,6 +219,7 @@ { pgd_t *pgd; pmd_t *pmd; pte_t *pte; int board = GRAPHICS_CARD (vma->vm_dentry->d_inode->i_rdev); + struct page *page; unsigned long virt_add, phys_add; @@ -247,8 +248,10 @@ pgd = pgd_offset(current->mm, address); pmd = pmd_offset(pgd, address); - pte = pte_offset(pmd, address); - return pte_page(*pte); + pte = pte_offset_map(pmd, address); + page = pte_page(*pte); + pte_unmap(pte); + return page; } /* diff -Nru a/fs/buffer.c b/fs/buffer.c --- a/fs/buffer.c Thu Apr 17 15:25:14 2003 +++ b/fs/buffer.c Thu Apr 17 15:25:14 2003 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,8 @@ static int nr_buffers_type[NR_LIST]; static unsigned long size_buffers_type[NR_LIST]; +static LIST_HEAD(buffer_lru); +static int nr_used_buffer_heads; static struct buffer_head * unused_list; 
static int nr_unused_buffer_heads; static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; @@ -1134,6 +1137,8 @@ { if (unlikely(buffer_attached(bh))) BUG(); + list_del(&bh->lru); + nr_used_buffer_heads--; if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); } else { @@ -1169,6 +1174,8 @@ bh = unused_list; unused_list = bh->b_next_free; nr_unused_buffer_heads--; + list_add(&bh->lru, &buffer_lru); + nr_used_buffer_heads++; spin_unlock(&unused_list_lock); return bh; } @@ -1181,6 +1188,10 @@ if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) { bh->b_blocknr = -1; bh->b_this_page = NULL; + spin_lock(&unused_list_lock); + list_add(&bh->lru, &buffer_lru); + nr_used_buffer_heads++; + spin_unlock(&unused_list_lock); return bh; } @@ -1193,6 +1204,8 @@ bh = unused_list; unused_list = bh->b_next_free; nr_unused_buffer_heads--; + list_add(&bh->lru, &buffer_lru); + nr_used_buffer_heads++; spin_unlock(&unused_list_lock); return bh; } @@ -1335,6 +1348,45 @@ } /* + * try_to_reclaim_buffers - get rid of buffer heads when the VM needs space + * @priority - reclaim priority + * @gfp_mask - page freeing mask + * + * We rotate the buffers on the buffer_lru list, trying to reclaim + * them. + */ +int try_to_reclaim_buffers(int priority, unsigned int gfp_mask) +{ + int todo = nr_used_buffer_heads >> priority; + struct list_head * entry; + struct buffer_head * bh; + struct page * page; + int reclaimed = 0; + + spin_lock(&unused_list_lock); + while (todo-- && !list_empty(&buffer_lru)) { + entry = buffer_lru.prev; + list_move(entry, &buffer_lru); + bh = list_entry(entry, struct buffer_head, lru); + + page = bh->b_page; + if (TryLockPage(page)) + continue; + page_cache_get(page); + spin_unlock(&unused_list_lock); + + reclaimed += try_to_release_page(page, gfp_mask); + + UnlockPage(page); + page_cache_release(page); + spin_lock(&unused_list_lock); + } + spin_unlock(&unused_list_lock); + + return reclaimed; +} + +/* * We don't have to release all buffers here, but * we have to be sure that no dirty buffer is left * and no IO is going on (no buffer is locked), because @@ -2544,63 +2596,23 @@ return 1; } -/* - * The first time the VM inspects a page which has locked buffers, it - * will just mark it as needing waiting upon on the scan of the page LRU. - * BH_Wait_IO is used for this. - * - * The second time the VM visits the page, if it still has locked - * buffers, it is time to start writing them out. (BH_Wait_IO was set). - * - * The third time the VM visits the page, if the I/O hasn't completed - * then it's time to wait upon writeout. BH_Lock and BH_Launder are - * used for this. - * - * There is also the case of buffers which were locked by someone else - * - write(2) callers, bdflush, etc. There can be a huge number of these - * and we don't want to just skip them all and fail the page allocation. - * We want to be able to wait on these buffers as well. - * - * The BH_Launder bit is set in submit_bh() to indicate that I/O is - * underway against the buffer, doesn't matter who started it - we know - * that the buffer will eventually come unlocked, and so it's safe to - * wait on it. - * - * The caller holds the page lock and the caller will free this page - * into current->local_page, so by waiting on the page's buffers the - * caller is guaranteed to obtain this page. - * - * sync_page_buffers() will sort-of return true if all the buffers - * against this page are freeable, so try_to_free_buffers() should - * try to free the page's buffers a second time. 
This is a bit - * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly. - */ -static int sync_page_buffers(struct buffer_head *head) +static void sync_page_buffers(struct buffer_head *head) { struct buffer_head * bh = head; - int tryagain = 1; do { if (!buffer_dirty(bh) && !buffer_locked(bh)) continue; /* Don't start IO first time around.. */ - if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) { - tryagain = 0; + if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) continue; - } - /* Second time through we start actively writing out.. */ - if (test_and_set_bit(BH_Lock, &bh->b_state)) { - if (unlikely(!buffer_launder(bh))) { - tryagain = 0; - continue; - } - wait_on_buffer(bh); - tryagain = 1; + /* If we cannot lock the buffer just skip it. */ + if (test_and_set_bit(BH_Lock, &bh->b_state)) continue; - } + /* Second time through we start actively writing out.. */ if (!atomic_set_buffer_clean(bh)) { unlock_buffer(bh); continue; @@ -2610,10 +2622,9 @@ get_bh(bh); bh->b_end_io = end_buffer_io_sync; submit_bh(WRITE, bh); - tryagain = 0; } while ((bh = bh->b_this_page) != head); - return tryagain; + return; } /* @@ -2637,7 +2648,6 @@ { struct buffer_head * tmp, * bh = page->buffers; -cleaned_buffers_try_again: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); tmp = bh; @@ -2680,15 +2690,9 @@ write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); gfp_mask = pf_gfp_mask(gfp_mask); - if (gfp_mask & __GFP_IO) { - if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { - if (sync_page_buffers(bh)) { - /* no IO or waiting next time */ - gfp_mask = 0; - goto cleaned_buffers_try_again; - } - } - } + if ((gfp_mask & __GFP_IO) && + ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page))) + sync_page_buffers(bh); if (balance_dirty_state() >= 0) wakeup_bdflush(); return 0; @@ -2953,9 +2957,35 @@ break; ndirty -= NRSYNC; } - if (ndirty > 0 || bdflush_stop()) + if (ndirty > 0 || bdflush_stop()) { + run_task_queue(&tq_disk); interruptible_sleep_on(&bdflush_wait); + } + } +} + + +/* + * Do some IO post-processing here!!! 
+ */ +void do_io_postprocessing(void) +{ + int i; + struct buffer_head *bh, *next; + + spin_lock(&lru_list_lock); + bh = lru_list[BUF_LOCKED]; + if (bh) { + for (i = nr_buffers_type[BUF_LOCKED]; i-- > 0; bh = next) { + next = bh->b_next_free; + + if (!buffer_locked(bh)) + __refile_buffer(bh); + else + break; + } } + spin_unlock(&lru_list_lock); } /* @@ -3009,6 +3039,7 @@ #ifdef DEBUG printk(KERN_DEBUG "kupdate() activated...\n"); #endif + do_io_postprocessing(); sync_old_buffers(); run_task_queue(&tq_disk); } diff -Nru a/fs/exec.c b/fs/exec.c --- a/fs/exec.c Thu Apr 17 15:25:14 2003 +++ b/fs/exec.c Thu Apr 17 15:25:14 2003 @@ -286,33 +286,45 @@ pgd_t * pgd; pmd_t * pmd; pte_t * pte; + struct pte_chain * pte_chain; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); pgd = pgd_offset(tsk->mm, address); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto out_nounlock; + spin_lock(&tsk->mm->page_table_lock); pmd = pmd_alloc(tsk->mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc(tsk->mm, pmd, address); + pte = pte_alloc_map(tsk->mm, pmd, address); if (!pte) goto out; - if (!pte_none(*pte)) + if (!pte_none(*pte)) { + pte_unmap(pte); goto out; + } lru_cache_add(page); flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + pte_chain = page_add_rmap(page, pte, pte_chain); tsk->mm->rss++; + pte_unmap(pte); spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ + pte_chain_free(pte_chain); return; out: spin_unlock(&tsk->mm->page_table_lock); +out_nounlock: __free_page(page); force_sig(SIGKILL, tsk); + pte_chain_free(pte_chain); return; } diff -Nru a/fs/inode.c b/fs/inode.c --- a/fs/inode.c Thu Apr 17 15:25:14 2003 +++ b/fs/inode.c Thu Apr 17 15:25:14 2003 @@ -49,7 +49,8 @@ * other linked list is the "type" list: * "in_use" - valid inode, i_count > 0, i_nlink > 0 * "dirty" - as "in_use" but also dirty - * "unused" - valid inode, i_count = 0 + * "unused" - valid inode, i_count = 0, no pages in the pagecache + * "unused_pagecache" - valid inode, i_count = 0 but has pages in the pagecache * * A "dirty" list is maintained for each super block, * allowing for low-overhead inode sync() operations. 
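
For orientation: the comment hunk just above splits unused inodes into a class with and a class without pagecache pages, and the hunks that follow add the inode_unused_pagecache list plus __refile_inode()/refile_inode() to keep each inode on the right list. As a reading aid only, the routing described there condenses to the selection below; the helper name is invented, the real code is __refile_inode() in the next hunk, and the I_FREEING and unhashed-inode early returns are left out.

/*
 * Sketch only: which list an inode belongs on under the scheme described
 * above.  Mirrors __refile_inode() from the next hunk; the helper name is
 * invented and the I_FREEING / unhashed checks are omitted.
 */
#include <linux/fs.h>

static struct list_head *inode_target_list(struct inode *inode)
{
	if (inode->i_state & I_DIRTY)
		return &inode->i_sb->s_dirty;		/* "dirty" */
	if (atomic_read(&inode->i_count))
		return &inode_in_use;			/* "in_use" */
	if (inode->i_data.nrpages)
		return &inode_unused_pagecache;		/* unused, still has pagecache */
	return &inode_unused;				/* unused and cheap to reclaim */
}

Per the rmap 15e changelog entry above ("make reclaiming unused inodes more efficient"), the point of the extra list is presumably that the unused-inode scan only has to look at inodes whose pagecache is already gone, instead of repeatedly skipping over ones that still hold pages.
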
@@ -57,6 +58,7 @@ static LIST_HEAD(inode_in_use); static LIST_HEAD(inode_unused); +static LIST_HEAD(inode_unused_pagecache); static struct list_head *inode_hashtable; static LIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ @@ -254,6 +256,37 @@ inodes_stat.nr_unused--; } +static inline void __refile_inode(struct inode *inode) +{ + struct list_head *to; + + if (inode->i_state & I_FREEING) + return; + if (list_empty(&inode->i_hash)) + return; + + if (inode->i_state & I_DIRTY) + to = &inode->i_sb->s_dirty; + else if (atomic_read(&inode->i_count)) + to = &inode_in_use; + else if (inode->i_data.nrpages) + to = &inode_unused_pagecache; + else + to = &inode_unused; + list_del(&inode->i_list); + list_add(&inode->i_list, to); +} + +void refile_inode(struct inode *inode) +{ +/* if (in_interrupt()) + BUG(); */ + if (!inode) return; + spin_lock(&inode_lock); + __refile_inode(inode); + spin_unlock(&inode_lock); +} + static inline void __sync_one(struct inode *inode, int sync) { unsigned dirty; @@ -280,17 +313,8 @@ spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; - if (!(inode->i_state & I_FREEING)) { - struct list_head *to; - if (inode->i_state & I_DIRTY) - to = &inode->i_sb->s_dirty; - else if (atomic_read(&inode->i_count)) - to = &inode_in_use; - else - to = &inode_unused; - list_del(&inode->i_list); - list_add(&inode->i_list, to); - } + if (!(inode->i_state & I_FREEING)) + __refile_inode(inode); wake_up(&inode->i_wait); } @@ -659,6 +683,7 @@ spin_lock(&inode_lock); busy = invalidate_list(&inode_in_use, sb, &throw_away); busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&inode_unused_pagecache, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); spin_unlock(&inode_lock); @@ -734,8 +759,7 @@ if (atomic_read(&inode->i_count)) continue; list_del(tmp); - list_del(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_hash); + list_del_init(&inode->i_hash); list_add(tmp, freeable); inode->i_state |= I_FREEING; count++; @@ -1063,10 +1087,8 @@ BUG(); } else { if (!list_empty(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_LOCK))) { - list_del(&inode->i_list); - list_add(&inode->i_list, &inode_unused); - } + if (!(inode->i_state & (I_DIRTY|I_LOCK))) + __refile_inode(inode); inodes_stat.nr_unused++; spin_unlock(&inode_lock); if (!sb || (sb->s_flags & MS_ACTIVE)) @@ -1221,6 +1243,11 @@ remove_inode_dquot_ref(inode, type, &tofree_head); } list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused_pagecache) { inode = list_entry(act_head, struct inode, i_list); if (inode->i_sb == sb && IS_QUOTAINIT(inode)) remove_inode_dquot_ref(inode, type, &tofree_head); diff -Nru a/fs/proc/array.c b/fs/proc/array.c --- a/fs/proc/array.c Thu Apr 17 15:25:14 2003 +++ b/fs/proc/array.c Thu Apr 17 15:25:14 2003 @@ -399,7 +399,7 @@ static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total) { - pte_t * pte; + pte_t * pte, *mapping; unsigned long end; if (pmd_none(*pmd)) @@ -409,7 +409,7 @@ pmd_clear(pmd); return; } - pte = pte_offset(pmd, address); + mapping = pte = pte_offset_map(pmd, address); address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -434,6 +434,7 @@ if (page_count(pte_page(page)) > 1) ++*shared; } while (address < 
end); + pte_unmap(mapping); } static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c Thu Apr 17 15:25:14 2003 +++ b/fs/proc/proc_misc.c Thu Apr 17 15:25:14 2003 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -186,7 +187,12 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inactive: %8u kB\n" + "ActiveAnon: %8u kB\n" + "ActiveCache: %8u kB\n" + "Inact_dirty: %8u kB\n" + "Inact_laundry:%8u kB\n" + "Inact_clean: %8u kB\n" + "Inact_target: %8u kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -199,8 +205,13 @@ K(i.bufferram), K(pg_size - swapper_space.nrpages), K(swapper_space.nrpages), - K(nr_active_pages), - K(nr_inactive_pages), + K(nr_active_anon_pages()) + K(nr_active_cache_pages()), + K(nr_active_anon_pages()), + K(nr_active_cache_pages()), + K(nr_inactive_dirty_pages()), + K(nr_inactive_laundry_pages()), + K(nr_inactive_clean_pages()), + K(inactive_target()), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -Nru a/include/asm-alpha/rmap.h b/include/asm-alpha/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-alpha/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _ALPHA_RMAP_H +#define _ALPHA_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-arm/proc-armv/rmap.h b/include/asm-arm/proc-armv/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-arm/proc-armv/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,72 @@ +#ifndef _ARMV_RMAP_H +#define _ARMV_RMAP_H +/* + * linux/include/asm-arm/proc-armv/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * + * We use the struct page of the page table page to find a pointer + * to an array of two 'struct arm_rmap_info's, one for each of the + * two page tables in each page. 
+ * + * - rmi->mm points to the process' mm_struct + * - rmi->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +struct arm_rmap_info { + struct mm_struct *mm; + unsigned long index; +}; + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = mm; + rmi->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = NULL; + rmi->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + return rmi->mm; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + unsigned long low_bits; + + if (((unsigned long)ptep)&2048) + rmi++; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return rmi->index + low_bits; +} + +#endif /* _ARMV_RMAP_H */ diff -Nru a/include/asm-arm/rmap.h b/include/asm-arm/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-arm/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,6 @@ +#ifndef _ARM_RMAP_H +#define _ARM_RMAP_H + +#include + +#endif /* _ARM_RMAP_H */ diff -Nru a/include/asm-cris/rmap.h b/include/asm-cris/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-cris/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _CRIS_RMAP_H +#define _CRIS_RMAP_H + +/* nothing to see, move along :) */ +#include + +#endif diff -Nru a/include/asm-generic/rmap.h b/include/asm-generic/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-generic/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,90 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout. + * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + * + * For CONFIG_HIGHPTE, we need to represent the address of a pte in a + * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE + * bits and is then ORed with the byte offset of the pte within its page. + * + * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for + * the offset. + * + * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for + * the offset. + */ +#include + +static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) +{ +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(struct page * page) +{ + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = kmap_atomic_to_page(ptep); + + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = kmap_atomic_to_page(ptep); + unsigned long low_bits; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return page->index + low_bits; +} + +#if CONFIG_HIGHPTE +static inline pte_addr_t ptep_to_paddr(pte_t *ptep) +{ + pte_addr_t paddr; + paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; + return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); +} +#else +static inline pte_addr_t ptep_to_paddr(pte_t *ptep) +{ + return (pte_addr_t)ptep; +} +#endif + +#ifndef CONFIG_HIGHPTE +static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) +{ + return (pte_t *)pte_paddr; +} + +static inline void rmap_ptep_unmap(pte_t *pte) +{ + return; +} +#endif + +#endif /* _GENERIC_RMAP_H */ diff -Nru a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h --- a/include/asm-i386/fixmap.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/fixmap.h Thu Apr 17 15:25:14 2003 @@ -101,6 +101,7 @@ #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) extern void __this_fixmap_does_not_exist(void); @@ -124,6 +125,12 @@ __this_fixmap_does_not_exist(); return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); } #endif diff -Nru a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h --- a/include/asm-i386/highmem.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/highmem.h Thu Apr 17 15:25:14 2003 @@ -127,6 +127,20 @@ #endif } +static inline struct page *kmap_atomic_to_page(void *ptr) +{ + unsigned long idx, vaddr = (unsigned long)ptr; + pte_t *pte; + + if (vaddr < FIXADDR_START) + return virt_to_page(ptr); + + idx = virt_to_fix(vaddr); + pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + return pte_page(*pte); +} + + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff -Nru a/include/asm-i386/kmap_types.h b/include/asm-i386/kmap_types.h --- a/include/asm-i386/kmap_types.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/kmap_types.h Thu Apr 17 15:25:14 2003 @@ -8,6 +8,9 @@ KM_USER0, KM_USER1, KM_BH_IRQ, + KM_PTE0, + KM_PTE1, + KM_PTE2, KM_TYPE_NR }; diff -Nru a/include/asm-i386/page.h b/include/asm-i386/page.h --- a/include/asm-i386/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/page.h Thu Apr 17 15:25:14 2003 @@ -131,7 +131,13 @@ #define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +#ifndef CONFIG_DISCONTIGMEM +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif /* !CONFIG_DISCONTIGMEM */ +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> 
PAGE_SHIFT) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ diff -Nru a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h --- a/include/asm-i386/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -5,143 +5,47 @@ #include #include #include +#include /* for struct page */ #define pgd_quicklist (current_cpu_data.pgd_quick) #define pmd_quicklist (current_cpu_data.pmd_quick) #define pte_quicklist (current_cpu_data.pte_quick) #define pgtable_cache_size (current_cpu_data.pgtable_cache_sz) -#define pmd_populate(mm, pmd, pte) \ +#define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +{ + set_pmd(pmd, __pmd(_PAGE_TABLE + + ((unsigned long long)page_to_pfn(pte) << + (unsigned long long) PAGE_SHIFT))); +} /* * Allocate and free page tables. */ -#if defined (CONFIG_X86_PAE) -/* - * We can't include here, thus these uglinesses. - */ -struct kmem_cache_s; - -extern struct kmem_cache_s *pae_pgd_cachep; -extern void *kmem_cache_alloc(struct kmem_cache_s *, int); -extern void kmem_cache_free(struct kmem_cache_s *, void *); - - -static inline pgd_t *get_pgd_slow(void) -{ - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); - - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; -} - -#else - -static inline pgd_t *get_pgd_slow(void) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); - - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -} - -#endif /* CONFIG_X86_PAE */ - -static inline pgd_t *get_pgd_fast(void) -{ - unsigned long *ret; +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(pgd_t *pgd); - if ((ret = pgd_quicklist) != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } else - ret = (unsigned long *)get_pgd_slow(); - return (pgd_t *)ret; -} +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); -static inline void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long *)pgd = (unsigned long) pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size++; -} +#define pte_alloc_one_fast(mm, address) (0) +#define pmd_alloc_one_fast(mm, address) (0) -static inline void free_pgd_slow(pgd_t *pgd) +static inline void pte_free_kernel(pte_t *pte) { -#if defined(CONFIG_X86_PAE) - int i; - - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); -#else - free_page((unsigned long)pgd); -#endif -} - -static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte; - - pte = (pte_t *) __get_free_page(GFP_KERNEL); - if (pte) - clear_page(pte); - return pte; -} - -static inline pte_t *pte_alloc_one_fast(struct 
mm_struct *mm, - unsigned long address) -{ - unsigned long *ret; - - if ((ret = (unsigned long *)pte_quicklist) != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size--; - } - return (pte_t *)ret; + free_page((unsigned long)pte); } -static inline void pte_free_fast(pte_t *pte) +static inline void pte_free(struct page *pte) { - *(unsigned long *)pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; + __free_page(pte); } -static __inline__ void pte_free_slow(pte_t *pte) -{ - free_page((unsigned long)pte); -} -#define pte_free(pte) pte_free_fast(pte) -#define pgd_free(pgd) free_pgd_slow(pgd) -#define pgd_alloc(mm) get_pgd_fast() +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is @@ -149,11 +53,9 @@ * (In the PAE case we free the pmds as part of the pgd.) */ -#define pmd_alloc_one_fast(mm, addr) ({ BUG(); ((pmd_t *)1); }) #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free_slow(x) do { } while (0) -#define pmd_free_fast(x) do { } while (0) #define pmd_free(x) do { } while (0) +#define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() extern int do_check_pgt_cache(int, int); diff -Nru a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h --- a/include/asm-i386/pgtable-2level.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgtable-2level.h Thu Apr 17 15:25:14 2003 @@ -60,6 +60,10 @@ #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) (mem_map+((unsigned long)(((x).pte_low >> PAGE_SHIFT)))) #define pte_none(x) (!(x).pte_low) -#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) +#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) +#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) + +#define __mk_pte(nr,prot) pfn_pte(nr,prot) #endif /* _I386_PGTABLE_2LEVEL_H */ diff -Nru a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h --- a/include/asm-i386/pgtable-3level.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgtable-3level.h Thu Apr 17 15:25:14 2003 @@ -89,10 +89,12 @@ return a.pte_low == b.pte_low && a.pte_high == b.pte_high; } -#define pte_page(x) (mem_map+(((x).pte_low >> PAGE_SHIFT) | ((x).pte_high << (32 - PAGE_SHIFT)))) +#define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_none(x) (!(x).pte_low && !(x).pte_high) +#define pte_pfn(x) (((x).pte_low >> PAGE_SHIFT) | ((x).pte_high << (32 - PAGE_SHIFT))) -static inline pte_t __mk_pte(unsigned long page_nr, pgprot_t pgprot) +#define __mk_pte(nr,prot) pfn_pte(nr,prot) +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { pte_t pte; @@ -100,5 +102,12 @@ pte.pte_low = (page_nr << PAGE_SHIFT) | pgprot_val(pgprot); return pte; } + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); +} + +extern struct kmem_cache_s *pae_pgd_cachep; #endif /* _I386_PGTABLE_3LEVEL_H */ diff -Nru a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h --- a/include/asm-i386/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgtable.h Thu Apr 17 15:25:14 2003 @@ -320,9 +320,13 @@ #define page_pte(page) page_pte_prot(page, __pgprot(0)) -#define pmd_page(pmd) \ +#define pmd_page_kernel(pmd) \ ((unsigned long) __va(pmd_val(pmd) & 
PAGE_MASK)) +#ifndef CONFIG_DISCONTIGMEM +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#endif /* !CONFIG_DISCONTIGMEM */ + /* to find an entry in a page-table-directory. */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -339,8 +343,35 @@ /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset(dir, address) ((pte_t *) pmd_page(*(dir)) + \ - __pte_offset(address)) +#define pte_offset_kernel(dir, address) \ + ((pte_t *) pmd_page_kernel(*(dir)) + __pte_offset(address)) + +#if defined(CONFIG_HIGHPTE) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address)) +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#endif + +#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) +typedef u32 pte_addr_t; +#endif + +#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) +typedef u64 pte_addr_t; +#endif + +#if !defined(CONFIG_HIGHPTE) +typedef pte_t *pte_addr_t; +#endif /* * The i386 doesn't have any external MMU info: the kernel page diff -Nru a/include/asm-i386/rmap.h b/include/asm-i386/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-i386/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,21 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* nothing to see, move along */ +#include + +#ifdef CONFIG_HIGHPTE +static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) +{ + unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); + unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; + return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); +} + +static inline void rmap_ptep_unmap(pte_t *pte) +{ + kunmap_atomic(pte, KM_PTE2); +} +#endif + +#endif diff -Nru a/include/asm-ia64/rmap.h b/include/asm-ia64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ia64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _IA64_RMAP_H +#define _IA64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-m68k/rmap.h b/include/asm-m68k/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-m68k/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _M68K_RMAP_H +#define _M68K_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-mips/rmap.h b/include/asm-mips/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-mips/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _MIPS_RMAP_H +#define _MIPS_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-mips64/rmap.h b/include/asm-mips64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-mips64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _MIPS64_RMAP_H +#define _MIPS64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-ppc/rmap.h b/include/asm-ppc/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ppc/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,9 @@ +#ifndef _PPC_RMAP_H +#define 
_PPC_RMAP_H + +/* PPC calls pte_alloc() before mem_map[] is setup ... */ +#define BROKEN_PPC_PTE_ALLOC_ONE + +#include + +#endif diff -Nru a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h --- a/include/asm-ppc64/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-ppc64/page.h Thu Apr 17 15:25:14 2003 @@ -234,6 +234,17 @@ #define __a2p(x) ((void *) absolute_to_phys(x)) #define __a2v(x) ((void *) __va(absolute_to_phys(x))) +#ifdef CONFIG_DISCONTIGMEM +#define page_to_pfn(page) discontigmem_page_to_pfn(page) +#define pfn_to_page(pfn) discontigmem_pfn_to_page(pfn) +#define pfn_valid(pfn) discontigmem_pfn_valid(pfn) +#else +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif + + #define virt_to_page(kaddr) (mem_map+(__pa((unsigned long)kaddr) >> PAGE_SHIFT)) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) diff -Nru a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h --- a/include/asm-ppc64/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-ppc64/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -80,6 +80,36 @@ return pmd; } +#define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte) +#define pmd_populate(mm, pmd, pte_page) \ + pmd_populate_kernel(mm, pmd, page_address(pte_page)) + +static inline pte_t * +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +{ + int count = 0; + pte_t *pte; + + do { + pte = (pte_t *)__get_free_page(GFP_KERNEL); + if (pte) + clear_page(pte); + else { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ); + } + } while (!pte && (count++ < 10)); + + return pte; +} + +static inline void +pte_free_kernel(pte_t *pte) +{ + free_page((unsigned long)pte); +} + + static inline void pmd_free (pmd_t *pmd) { @@ -88,39 +118,20 @@ ++pgtable_cache_size; } -#define pmd_populate(MM, PMD, PTE) pmd_set(PMD, PTE) +#define pte_alloc_one_fast(mm, address) (0) -static inline pte_t* -pte_alloc_one_fast (struct mm_struct *mm, unsigned long addr) +static inline struct page * +pte_alloc_one(struct mm_struct *mm, unsigned long address) { - unsigned long *ret = (unsigned long *)pte_quicklist; - - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - --pgtable_cache_size; - } - return (pte_t *)ret; -} + pte_t *pte = pte_alloc_one_kernel(mm, address); + if (pte) + return virt_to_page(pte); -static inline pte_t* -pte_alloc_one (struct mm_struct *mm, unsigned long addr) -{ - pte_t *pte = (pte_t *) __get_free_page(GFP_KERNEL); - - if (pte != NULL) - clear_page(pte); - return pte; + return NULL; } -static inline void -pte_free (pte_t *pte) -{ - *(unsigned long *)pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - ++pgtable_cache_size; -} +#define pte_free(pte_page) pte_free_kernel(page_address(pte_page)) extern int do_check_pgt_cache(int, int); diff -Nru a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h --- a/include/asm-ppc64/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-ppc64/pgtable.h Thu Apr 17 15:25:14 2003 @@ -196,7 +196,8 @@ #define pmd_bad(pmd) ((pmd_val(pmd)) == 0) #define pmd_present(pmd) ((pmd_val(pmd)) != 0) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) -#define pmd_page(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) #define pgd_set(pgdp, pmdp) (pgd_val(*(pgdp)) = (__ba_to_bpn(pmdp))) #define pgd_none(pgd) (!pgd_val(pgd)) #define pgd_bad(pgd) ((pgd_val(pgd)) == 0) @@ -217,8 +218,13 
@@ ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. */ -#define pte_offset(dir,addr) \ - ((pte_t *) pmd_page(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define pte_offset_kernel(dir,addr) \ + ((pte_t *) pmd_page_kernel(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) + +#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_unmap(pte) do { } while(0) +#define pte_unmap_nested(pte) do { } while(0) /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ @@ -399,6 +405,8 @@ extern void build_valid_hpte(unsigned long vsid, unsigned long ea, unsigned long pa, pte_t * ptep, unsigned hpteflags, unsigned bolted ); + +typedef pte_t *pte_addr_t; /* Encode and de-code a swap entry */ #define SWP_TYPE(entry) (((entry).val >> 1) & 0x3f) diff -Nru a/include/asm-ppc64/rmap.h b/include/asm-ppc64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ppc64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,9 @@ +#ifndef _PPC64_RMAP_H +#define _PPC64_RMAP_H + +/* PPC calls pte_alloc() before mem_map[] is setup ... */ +#define BROKEN_PPC_PTE_ALLOC_ONE + +#include + +#endif diff -Nru a/include/asm-s390/page.h b/include/asm-s390/page.h --- a/include/asm-s390/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390/page.h Thu Apr 17 15:25:14 2003 @@ -121,6 +121,9 @@ #define PAGE_OFFSET 0x0UL #define __pa(x) (unsigned long)(x) #define __va(x) (void *)(x) +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) diff -Nru a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h --- a/include/asm-s390/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -28,7 +28,7 @@ * if any. */ -extern __inline__ pgd_t* get_pgd_slow(void) +extern __inline__ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; int i; @@ -40,42 +40,11 @@ return ret; } -extern __inline__ pgd_t* get_pgd_fast(void) -{ - unsigned long *ret = pgd_quicklist; - - if (ret != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size -= 2; - } - return (pgd_t *)ret; -} - -extern __inline__ pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd; - - pgd = get_pgd_fast(); - if (!pgd) - pgd = get_pgd_slow(); - return pgd; -} - -extern __inline__ void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long *)pgd = (unsigned long) pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size += 2; -} - -extern __inline__ void free_pgd_slow(pgd_t *pgd) +extern __inline__ void pgd_free(pgd_t *pgd) { free_pages((unsigned long) pgd, 1); } -#define pgd_free(pgd) free_pgd_fast(pgd) - /* * page middle directory allocation/free routines. * We don't use pmd cache, so these are dummy routines. 
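
The pte_offset_map()/pte_unmap() pair defined here (and in the other architecture headers touched by this patch) replaces the old pte_offset(). On ppc64, s390 and x86_64 the map is a plain pointer calculation and the unmap expands to nothing; on i386 with CONFIG_HIGHPTE it kmap_atomic()s the page table page. A usage sketch only, not code from this patch, assuming the caller holds mm->page_table_lock:

static pte_t example_read_pte(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd = pgd_offset(mm, address);
	pmd_t *pmd;
	pte_t *pte, entry = __pte(0);

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return entry;
	pmd = pmd_offset(pgd, address);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return entry;
	pte = pte_offset_map(pmd, address);	/* kmap_atomic with CONFIG_HIGHPTE */
	entry = *pte;
	pte_unmap(pte);				/* no-op without CONFIG_HIGHPTE */
	return entry;
}
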
This @@ -88,7 +57,7 @@ #define pmd_free_fast(x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() -extern inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) +extern inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { pmd_val(pmd[0]) = _PAGE_TABLE + __pa(pte); pmd_val(pmd[1]) = _PAGE_TABLE + __pa(pte+256); @@ -96,14 +65,25 @@ pmd_val(pmd[3]) = _PAGE_TABLE + __pa(pte+768); } +extern inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +{ + pte_t *p = page_address(pte); + if (p == NULL) BUG(); + pmd_val(pmd[0]) = _PAGE_TABLE + __pa(p); + pmd_val(pmd[1]) = _PAGE_TABLE + __pa(p+256); + pmd_val(pmd[2]) = _PAGE_TABLE + __pa(p+512); + pmd_val(pmd[3]) = _PAGE_TABLE + __pa(p+768); +} + /* * page table entry allocation/free routines. */ -extern inline pte_t * pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pte_t *pte; int i; + /* XXX Riel retries this 10 times if get_free_page returns NULL */ pte = (pte_t *) __get_free_page(GFP_KERNEL); if (pte != NULL) { for (i=0; i < PTRS_PER_PTE; i++) @@ -112,41 +92,49 @@ return pte; } -extern __inline__ pte_t * -pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) +extern inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long a) { - unsigned long *ret = (unsigned long *) pte_quicklist; + struct page *pte; + pte_t *p; + int i; - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size--; - } - return (pte_t *)ret; + /* XXX Riel retries this 10 times if alloc_pages returns NULL */ + pte = alloc_pages(GFP_KERNEL, 0); + if (pte != NULL) { + /* + * This is a pure cheating, using the fact that we + * are not a highmem architecture, regardles of .config + */ + p = page_address(pte); + if (p == NULL) BUG(); + for (i=0; i < PTRS_PER_PTE; i++) + pte_clear(p+i); + } + return pte; } -extern __inline__ void pte_free_fast(pte_t *pte) -{ - *(unsigned long *)pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} +#define pte_alloc_one_fast(mm, address) (0) -extern __inline__ void pte_free_slow(pte_t *pte) +extern inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long) pte); } -#define pte_free(pte) pte_free_fast(pte) +extern inline void pte_free(struct page *pte) +{ + __free_page(pte); +} extern int do_check_pgt_cache(int, int); +#if 0 /* P3 */ /* * This establishes kernel virtual mappings (e.g., as a result of a * vmalloc call). Since s390-esame uses a separate kernel page table, * there is nothing to do here... :) */ #define set_pgdir(addr,entry) do { } while(0) +#endif /* * TLB flushing: diff -Nru a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h --- a/include/asm-s390/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390/pgtable.h Thu Apr 17 15:25:14 2003 @@ -454,8 +454,8 @@ #define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT))) -#define pmd_page(pmd) \ - ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page(x) (mem_map+(unsigned long)((pmd_val(x) >> PAGE_SHIFT))) /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -471,8 +471,17 @@ } /* Find an entry in the third-level page table.. 
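
The four pmd_val() assignments above encode the 31-bit s390 page table layout: one 4K page holds 1024 four-byte ptes, wired up as four separate 256-entry tables. Spelled out as an illustration (not part of the patch):

/*
 * Illustration only: what pmd_populate() above sets up.
 *
 *	pmd[0] -> pte + 0	(bytes    0 .. 1023 of the pte page)
 *	pmd[1] -> pte + 256	(bytes 1024 .. 2047)
 *	pmd[2] -> pte + 512	(bytes 2048 .. 3071)
 *	pmd[3] -> pte + 768	(bytes 3072 .. 4095)
 *
 * This is also why pte_alloc_one() can return a single page and why
 * the page_address() shortcut is safe: as the comment in the code
 * says, s390 is not a highmem architecture, so the pte page always
 * has a kernel mapping.
 */
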
*/ -#define pte_offset(pmd, address) \ - ((pte_t *) (pmd_page(*pmd) + ((address>>10) & ((PTRS_PER_PTE-1)<<2)))) +#define __pte_offset(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) +#define pte_offset_kernel(pmd, address) \ + ((pte_t *) pmd_page_kernel(*pmd) + __pte_offset(address)) + +#define pte_offset_map(dir, address) \ + ((pte_t *) page_address(pmd_page(*(dir))) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + +typedef u32 pte_addr_t; /* * A page-table entry has some bits we have to treat in a special way. diff -Nru a/include/asm-s390/rmap.h b/include/asm-s390/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-s390/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _S390_RMAP_H +#define _S390_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-s390x/page.h b/include/asm-s390x/page.h --- a/include/asm-s390x/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390x/page.h Thu Apr 17 15:25:14 2003 @@ -95,22 +95,14 @@ unsigned long pmd0; unsigned long pmd1; } pmd_t; -typedef unsigned int pgd_t; +typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd0) #define pmd_val1(x) ((x).pmd1) - -static inline unsigned long __pgd_val(pgd_t *pgdp) -{ - unsigned long addr = (unsigned long) pgdp; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - - return *pgd_slot + ((addr & 4) << 11); -} -#define pgd_val(pgd) __pgd_val(&(pgd)) - +#define __pgd_val(x) ((x)->pgd) /* Violation in our linux/mm.h P3 */ +#define pgd_val(x) ((x).pgd) #define pgprot_val(x) ((x).pgprot) #define __pte(x) ((pte_t) { (x) } ) @@ -127,6 +119,9 @@ #define PAGE_OFFSET 0x0UL #define __pa(x) (unsigned long)(x) #define __va(x) (void *)(x) +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((page) - mem_map) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) diff -Nru a/include/asm-s390x/pgalloc.h b/include/asm-s390x/pgalloc.h --- a/include/asm-s390x/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390x/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -18,9 +18,6 @@ #include #include -#define pgd_quicklist (S390_lowcore.cpu_data.pgd_quick) -#define pmd_quicklist (S390_lowcore.cpu_data.pmd_quick) -#define pte_quicklist (S390_lowcore.cpu_data.pte_quick) #define pgtable_cache_size (S390_lowcore.cpu_data.pgtable_cache_sz) /* @@ -32,56 +29,28 @@ /* * page directory allocation/free routines. 
*/ -extern __inline__ pgd_t *get_pgd_slow (void) +extern inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - int i; + int i; - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, 1); + ret = (pgd_t *) __get_free_pages(GFP_KERNEL, 2); if (ret != NULL) - for (i = 0; i < PTRS_PER_PGD; i++) + for (i = 0; i < PTRS_PER_PGD; i++) pgd_clear(ret + i); return ret; } -extern __inline__ pgd_t *get_pgd_fast (void) +extern inline void pgd_free(pgd_t *pgd) { - unsigned long *ret = pgd_quicklist; - - if (ret != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size -= 2; - } - return (pgd_t *) ret; -} - -extern __inline__ pgd_t *pgd_alloc (struct mm_struct *mm) -{ - pgd_t *pgd; - - pgd = get_pgd_fast(); - if (!pgd) - pgd = get_pgd_slow(); - return pgd; + free_pages((unsigned long) pgd, 2); } -extern __inline__ void free_pgd_fast (pgd_t *pgd) +extern inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) { - *(unsigned long *) pgd = (unsigned long) pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size += 2; + pgd_val(*pgd) = _PGD_ENTRY | __pa(pmd); } -extern __inline__ void free_pgd_slow (pgd_t *pgd) -{ - free_pages((unsigned long) pgd, 1); -} - -#define pgd_free(pgd) free_pgd_fast(pgd) - -extern pmd_t *pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd); - /* * page middle directory allocation/free routines. */ @@ -90,7 +59,7 @@ pmd_t *pmd; int i; - pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 1); + pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); if (pmd != NULL) { for (i=0; i < PTRS_PER_PMD; i++) pmd_clear(pmd+i); @@ -98,51 +67,39 @@ return pmd; } -extern __inline__ pmd_t * -pmd_alloc_one_fast(struct mm_struct *mm, unsigned long address) -{ - unsigned long *ret = (unsigned long *) pmd_quicklist; - - if (ret != NULL) { - pmd_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size -= 2; - } - return (pmd_t *) ret; -} +#define pmd_alloc_one_fast(mm, address) (0) -extern void pmd_free_order2(pmd_t *); -extern __inline__ void pmd_free_fast (pmd_t *pmd) +extern inline void pmd_free(pmd_t *pmd) { - if (test_bit(PG_arch_1, &virt_to_page(pmd)->flags) == 0) { - *(unsigned long *) pmd = (unsigned long) pmd_quicklist; - pmd_quicklist = (unsigned long *) pmd; - pgtable_cache_size += 2; - } else - pmd_free_order2(pmd); + free_pages((unsigned long) pmd, 2); } -extern __inline__ void pmd_free_slow (pmd_t *pmd) +extern inline void +pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - free_pages((unsigned long) pmd, 1); + pmd_val(*pmd) = _PMD_ENTRY | __pa(pte); + pmd_val1(*pmd) = _PMD_ENTRY | __pa(pte+256); } -#define pmd_free(pmd) pmd_free_fast(pmd) - -extern inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) +extern inline void +pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { - pmd_val(*pmd) = _PMD_ENTRY | __pa(pte); - pmd_val1(*pmd) = _PMD_ENTRY | __pa(pte+256); + pte_t *p = page_address(pte); + if (p == NULL) BUG(); + pmd_val(*pmd) = _PMD_ENTRY | __pa(p); + pmd_val1(*pmd) = _PMD_ENTRY | __pa(p+256); } /* * page table entry allocation/free routines. 
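
The allocations above grow from order 1 to order 2 because the s390x table geometry changes in this patch. The arithmetic, as an illustration only (assuming the 8-byte pgd_t and 16-byte two-part pmd_t shown earlier, and the PTRS_PER_PMD change further down):

/*
 * Illustration only, not part of the patch:
 *
 *	pgd: PTRS_PER_PGD (2048) * sizeof(pgd_t) (8 bytes)  = 16K = 4 pages
 *	pmd: PTRS_PER_PMD (1024) * sizeof(pmd_t) (16 bytes) = 16K = 4 pages
 *
 * so both tables now need __get_free_pages(GFP_KERNEL, 2), matching
 * the PTRS_PER_PMD 512 -> 1024 change in asm-s390x/pgtable.h below.
 */
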
*/ -extern inline pte_t * pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +extern inline pte_t * +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pte_t *pte; int i; + /* XXX Riel retries this 10 times if get_free_page returns NULL */ pte = (pte_t *) __get_free_page(GFP_KERNEL); if (pte != NULL) { for (i=0; i < PTRS_PER_PTE; i++) @@ -151,40 +108,34 @@ return pte; } -extern __inline__ pte_t* pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) +extern inline struct page * +pte_alloc_one(struct mm_struct *mm, unsigned long addr) { - unsigned long *ret = (unsigned long *) pte_quicklist; - - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size--; - } - return (pte_t *)ret; + return virt_to_page(pte_alloc_one_kernel(mm, addr)); } -extern __inline__ void pte_free_fast (pte_t *pte) -{ - *(unsigned long *) pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} +#define pte_alloc_one_fast(mm, address) (0) -extern __inline__ void pte_free_slow (pte_t *pte) +extern inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long) pte); } -#define pte_free(pte) pte_free_fast(pte) +extern inline void pte_free(struct page *pte) +{ + __free_page(pte); +} -extern int do_check_pgt_cache (int, int); +#define do_check_pgt_cache(x, y) (0) /* No quicklists in rmap P3 */ +#if 0 /* P3 */ /* * This establishes kernel virtual mappings (e.g., as a result of a * vmalloc call). Since s390-esame uses a separate kernel page table, * there is nothing to do here... :) */ #define set_pgdir(vmaddr, entry) do { } while(0) +#endif /* * TLB flushing: diff -Nru a/include/asm-s390x/pgtable.h b/include/asm-s390x/pgtable.h --- a/include/asm-s390x/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390x/pgtable.h Thu Apr 17 15:25:14 2003 @@ -63,7 +63,7 @@ #define PMD_MASK (~(PMD_SIZE-1)) /* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT 30 +#define PGDIR_SHIFT 31 #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -72,7 +72,7 @@ * currently we use a 3 level lookup */ #define PTRS_PER_PTE 512 -#define PTRS_PER_PMD 512 +#define PTRS_PER_PMD 1024 #define PTRS_PER_PGD 2048 /* @@ -169,15 +169,13 @@ /* Bits in the region third table entry */ #define _PGD_ENTRY_INV 0x20 /* region table entry invalid bit */ -#define _PGD_ENTRY_MASK 0x04 /* region third table entry mask */ -#define _PGD_ENTRY_LEN(x) ((x)&3) /* region table length bits */ -#define _PGD_ENTRY_OFF(x) (((x)&3)<<6) /* region table offset bits */ +#define _PGD_ENTRY 0x07 /* * User and kernel page directory */ #define _REGION_THIRD 0x4 -#define _REGION_THIRD_LEN 0x1 +#define _REGION_THIRD_LEN 0x3 #define _REGION_TABLE (_REGION_THIRD|_REGION_THIRD_LEN|0x40|0x100) #define _KERN_REGION_TABLE (_REGION_THIRD|_REGION_THIRD_LEN) @@ -254,37 +252,20 @@ /* * pgd/pmd/pte query functions */ -extern inline int __pgd_present(pgd_t *pgd) +extern inline int pgd_present(pgd_t pgd) { - unsigned long addr = (unsigned long) pgd; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - unsigned long offset = (addr & 4) >> 1; - - if (*pgd_slot & _PGD_ENTRY_INV) - return 0; - if ((*pgd_slot & _PGD_ENTRY_OFF(3)) > _PGD_ENTRY_OFF(offset)) - return 0; - if ((*pgd_slot & _PGD_ENTRY_LEN(3)) < _PGD_ENTRY_LEN(offset)) - return 0; - return 1; + return (pgd_val(pgd) & ~PAGE_MASK) == _PGD_ENTRY; } -#define pgd_present(pgd) __pgd_present(&(pgd)) -extern inline int __pgd_none(pgd_t *pgd) +extern 
inline int pgd_none(pgd_t pgd) { - return !__pgd_present(pgd); + return pgd_val(pgd) & _PGD_ENTRY_INV; } -#define pgd_none(pgd) __pgd_none(&(pgd)) -extern inline int __pgd_bad(pgd_t *pgd) +extern inline int pgd_bad(pgd_t pgd) { - unsigned long addr = (unsigned long) pgd; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - - return (*pgd_slot & (~PAGE_MASK & ~_PGD_ENTRY_INV & ~_PGD_ENTRY_MASK & - ~_PGD_ENTRY_LEN(3) & ~_PGD_ENTRY_OFF(3))) != 0; + return (pgd_val(pgd) & (~PAGE_MASK & ~_PGD_ENTRY_INV)) != _PGD_ENTRY; } -#define pgd_bad(pgd) __pgd_bad(&(pgd)) extern inline int pmd_present(pmd_t pmd) { @@ -346,27 +327,7 @@ */ extern inline void pgd_clear(pgd_t * pgdp) { - unsigned long addr = (unsigned long) pgdp; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - unsigned long offset = addr & 4; - - if (*pgd_slot & _PGD_ENTRY_INV) { - *pgd_slot = _PGD_ENTRY_INV; - return; - } - if (offset == 0 && (*pgd_slot & _PGD_ENTRY_LEN(2)) != 0) { - /* Clear lower pmd, upper pmd still used. */ - *pgd_slot = (*pgd_slot & PAGE_MASK) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(2) | _PGD_ENTRY_LEN(3); - return; - } - if (offset == 4 && (*pgd_slot & _PGD_ENTRY_OFF(2)) == 0) { - /* Clear upped pmd, lower pmd still used. */ - *pgd_slot = (*pgd_slot & PAGE_MASK) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(0) | _PGD_ENTRY_LEN(1); - return; - } - *pgd_slot = _PGD_ENTRY_INV; + pgd_val(*pgdp) = _PGD_ENTRY_INV | _PGD_ENTRY; } extern inline void pmd_clear(pmd_t * pmdp) @@ -512,8 +473,9 @@ #define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT))) -#define pmd_page(pmd) \ +#define pmd_page_kernel(pmd) \ ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page(x) (mem_map+(unsigned long)((pmd_val(x) >> PAGE_SHIFT))) /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -530,8 +492,17 @@ ((pmd_t *) pgd_page(dir) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. */ -#define pte_offset(dir,addr) \ - ((pte_t *) pmd_page(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define __pte_offset(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) +#define pte_offset_kernel(pmd, address) \ + ((pte_t *) pmd_page_kernel(*pmd) + __pte_offset(address)) + +#define pte_offset_map(dir, address) \ + ((pte_t *) page_address(pmd_page(*(dir))) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + +typedef u64 pte_addr_t; /* * A page-table entry has some bits we have to treat in a special way. 
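
Raising PGDIR_SHIFT to 31 and PTRS_PER_PMD to 1024 in the hunk above changes how much address space each table level covers. Worked out as an illustration only (assuming 4K pages and 8-byte pte entries):

/*
 * Illustration only, not part of the patch:
 *
 *	pte table:  512 entries * 4K = 2M mapped per pmd entry
 *	pmd table: 1024 entries * 2M = 2G mapped per pgd entry  (PGDIR_SHIFT 31)
 *	pgd table: 2048 entries * 2G = 4T total, i.e. a 42-bit address space
 */
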
diff -Nru a/include/asm-s390x/rmap.h b/include/asm-s390x/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-s390x/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _S390X_RMAP_H +#define _S390X_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sh/rmap.h b/include/asm-sh/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sh/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _SH_RMAP_H +#define _SH_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sparc/rmap.h b/include/asm-sparc/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sparc/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _SPARC_RMAP_H +#define _SPARC_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sparc64/rmap.h b/include/asm-sparc64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sparc64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _SPARC64_RMAP_H +#define _SPARC64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h --- a/include/asm-x86_64/io.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/io.h Thu Apr 17 15:25:14 2003 @@ -137,15 +137,6 @@ return __va(address); } -/* - * Change "struct page" to physical address. - */ -#ifdef CONFIG_DISCONTIGMEM -#include -#else -#define page_to_phys(page) (((page) - mem_map) << PAGE_SHIFT) -#endif - extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); extern inline void * ioremap (unsigned long offset, unsigned long size) diff -Nru a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h --- a/include/asm-x86_64/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/page.h Thu Apr 17 15:25:14 2003 @@ -114,31 +114,31 @@ /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. Otherwise you risk miscompilation. */ #define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) -/* __pa_symbol should use for C visible symbols, but only for them. +/* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. 
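
As a usage sketch only (not part of the patch), taking the physical address of a C-visible kernel symbol goes through __pa_symbol(); the empty asm presumably just forces the address through a register so gcc cannot fold or miscompile the arithmetic, as the note above warns. _text is only an example linker symbol here:

extern char _text[];	/* example symbol, for illustration only */

static inline unsigned long example_phys_of_text(void)
{
	return __pa_symbol(&_text);	/* not __pa(&_text) */
}
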
*/ #define __pa_symbol(x) \ ({unsigned long v; \ asm("" : "=r" (v) : "0" (x)); \ - v - __START_KERNEL_map; }) -#define __pa_maybe_symbol(x) \ - ({unsigned long v; \ - asm("" : "=r" (v) : "0" (x)); \ __pa(v); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #ifndef CONFIG_DISCONTIGMEM -#define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((page) - mem_map) #define page_to_phys(page) (((page) - mem_map) << PAGE_SHIFT) +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #define VALID_PAGE(page) (((page) - mem_map) < max_mapnr) +#define pfn_to_phys(pfn) ((unsigned long)(pfn) << PAGE_SHIFT) +#else +#include #endif -#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) - +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #endif /* __KERNEL__ */ diff -Nru a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h --- a/include/asm-x86_64/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -9,137 +9,79 @@ #include #include -#define inc_pgcache_size() add_pda(pgtable_cache_sz,1UL) -#define dec_pgcache_size() sub_pda(pgtable_cache_sz,1UL) - -#define pmd_populate(mm, pmd, pte) \ +#define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) #define pgd_populate(mm, pgd, pmd) \ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pmd))) -extern __inline__ pmd_t *get_pmd_slow(void) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL); + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); } -extern __inline__ pmd_t *get_pmd_fast(void) +extern __inline__ pmd_t *get_pmd(void) { - unsigned long *ret; - - if ((ret = read_pda(pmd_quick)) != NULL) { - write_pda(pmd_quick, (unsigned long *)(*ret)); - ret[0] = 0; - dec_pgcache_size(); - } else - ret = (unsigned long *)get_pmd_slow(); - return (pmd_t *)ret; + return (pmd_t *)get_zeroed_page(GFP_KERNEL); } extern __inline__ void pmd_free(pmd_t *pmd) { - *(unsigned long *)pmd = (unsigned long) read_pda(pmd_quick); - write_pda(pmd_quick,(unsigned long *) pmd); - inc_pgcache_size(); -} - -extern __inline__ void pmd_free_slow(pmd_t *pmd) -{ if ((unsigned long)pmd & (PAGE_SIZE-1)) - out_of_line_bug(); + BUG(); free_page((unsigned long)pmd); } -static inline pmd_t *pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr) -{ - unsigned long *ret = (unsigned long *)read_pda(pmd_quick); - - if (ret != NULL) { - write_pda(pmd_quick, (unsigned long *)(*ret)); - ret[0] = 0; - dec_pgcache_size(); - } - return (pmd_t *)ret; -} - static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL); -} - -static inline pgd_t *pgd_alloc_one_fast (void) -{ - unsigned long *ret = read_pda(pgd_quick); - - if (ret) { - write_pda(pgd_quick,(unsigned long *)(*ret)); - ret[0] = 0; - dec_pgcache_size(); - } - return (pgd_t *) ret; + return (pmd_t *) get_zeroed_page(GFP_KERNEL); } static inline pgd_t *pgd_alloc (struct mm_struct *mm) { - 
/* the VM system never calls pgd_alloc_one_fast(), so we do it here. */ - pgd_t *pgd = pgd_alloc_one_fast(); - - if (pgd == NULL) - pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); - return pgd; + return (pgd_t *)get_zeroed_page(GFP_KERNEL); } static inline void pgd_free (pgd_t *pgd) { - *(unsigned long *)pgd = (unsigned long) read_pda(pgd_quick); - write_pda(pgd_quick,(unsigned long *) pgd); - inc_pgcache_size(); -} - - -static inline void pgd_free_slow (pgd_t *pgd) -{ if ((unsigned long)pgd & (PAGE_SIZE-1)) - out_of_line_bug(); + BUG(); free_page((unsigned long)pgd); } - -static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)get_zeroed_page(GFP_KERNEL); + return (pte_t *) get_zeroed_page(GFP_KERNEL); } -extern __inline__ pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) -{ - unsigned long *ret; +#define pte_alloc_one_fast(x,y) (0) +#define pmd_alloc_one_fast(x,y) (0) +#define do_check_pgt_cache(x,y) (0) - if ((ret = read_pda(pte_quick)) != NULL) { - write_pda(pte_quick, (unsigned long *)(*ret)); - ret[0] = ret[1]; - dec_pgcache_size(); - } - return (pte_t *)ret; +static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + void *p = (void *)get_zeroed_page(GFP_KERNEL); + if (!p) + return NULL; + return virt_to_page(p); } -/* Should really implement gc for free page table pages. This could be done with - a reference count in struct page. */ +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. */ -extern __inline__ void pte_free(pte_t *pte) -{ - *(unsigned long *)pte = (unsigned long) read_pda(pte_quick); - write_pda(pte_quick, (unsigned long *) pte); - inc_pgcache_size(); -} - -extern __inline__ void pte_free_slow(pte_t *pte) +extern __inline__ void pte_free_kernel(pte_t *pte) { if ((unsigned long)pte & (PAGE_SIZE-1)) - out_of_line_bug(); + BUG(); free_page((unsigned long)pte); } +extern inline void pte_free(struct page *pte) +{ + __free_page(pte); +} -extern int do_check_pgt_cache(int, int); +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +#define __pmd_free_tlb(tlb,x) pmd_free(x) /* * TLB flushing: diff -Nru a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h --- a/include/asm-x86_64/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/pgtable.h Thu Apr 17 15:25:14 2003 @@ -18,6 +18,7 @@ #include #include #include +#include extern pgd_t level3_kernel_pgt[512]; extern pgd_t level3_physmem_pgt[512]; @@ -373,7 +374,7 @@ } #define page_pte(page) page_pte_prot(page, __pgprot(0)) -#define __pmd_page(pmd) (__va(pmd_val(pmd) & PHYSICAL_PAGE_MASK)) +#define pmd_page_kernel(pmd) (__va(pmd_val(pmd) & PHYSICAL_PAGE_MASK)) /* to find an entry in a page-table-directory. */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -396,9 +397,17 @@ /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset(dir, address) ((pte_t *) __pmd_page(*(dir)) + \ +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \ __pte_offset(address)) +/* x86-64 always has all page tables mapped. 
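
Both flavours appear in this hunk: pmd_populate_kernel() takes a kernel-virtual pte pointer and uses __pa(), while pmd_populate() takes a struct page and shifts its pfn. For a page-aligned, directly mapped pte page the two agree; the struct-page form is what lets i386 keep pte pages in highmem under CONFIG_HIGHPTE. A check, as an illustration only:

static inline void example_check_pte_page(pte_t *p)
{
	/* illustration only: p must be page aligned and directly mapped */
	BUG_ON(__pa(p) != (page_to_pfn(virt_to_page(p)) << PAGE_SHIFT));
}
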
*/ +#define pte_offset_map(dir, address) pte_offset_kernel(dir,address) +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address) +#define pte_unmap(pte) /* NOP */ +#define pte_unmap_nested(pte) /* NOP */ + +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) + /* never use these in the common code */ #define pml4_page(level4) ((unsigned long) __va(pml4_val(level4) & PHYSICAL_PAGE_MASK)) #define pml4_index(address) (((address) >> PML4_SHIFT) & (PTRS_PER_PML4-1)) @@ -419,6 +428,8 @@ #define SWP_ENTRY(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) #define pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define swp_entry_to_pte(x) ((pte_t) { (x).val }) + +typedef pte_t *pte_addr_t; struct page; /* diff -Nru a/include/asm-x86_64/rmap.h b/include/asm-x86_64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-x86_64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _X64_64_RMAP_H +#define _X86_64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/linux/brlock.h b/include/linux/brlock.h --- a/include/linux/brlock.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/brlock.h Thu Apr 17 15:25:14 2003 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_LRU_LOCK, __BR_END }; diff -Nru a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/fs.h Thu Apr 17 15:25:14 2003 @@ -268,6 +268,7 @@ wait_queue_head_t b_wait; struct list_head b_inode_buffers; /* doubly linked list of inode dirty buffers */ + struct list_head lru; /* Reclaim used buffers easily. */ }; typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); @@ -1374,6 +1375,8 @@ extern void inode_init_once(struct inode *); extern void iput(struct inode *); +extern void refile_inode(struct inode *inode); + extern void force_delete(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); @@ -1436,6 +1439,7 @@ /* Generic buffer handling for block filesystems.. */ extern int try_to_release_page(struct page * page, int gfp_mask); +extern int try_to_reclaim_buffers(int, unsigned int); extern int discard_bh_page(struct page *, unsigned long, int); #define block_flushpage(page, offset) discard_bh_page(page, offset, 1) #define block_invalidate_page(page) discard_bh_page(page, 0, 0) diff -Nru a/include/linux/highmem.h b/include/linux/highmem.h --- a/include/linux/highmem.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/highmem.h Thu Apr 17 15:25:14 2003 @@ -71,6 +71,7 @@ #define kmap_atomic(page,idx) kmap(page) #define kunmap_atomic(page,idx) kunmap(page) +#define kmap_atomic_to_page(ptr) virt_to_page(ptr) #define bh_kmap(bh) ((bh)->b_data) #define bh_kunmap(bh) do { } while (0) diff -Nru a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/mm.h Thu Apr 17 15:25:14 2003 @@ -1,5 +1,23 @@ #ifndef _LINUX_MM_H #define _LINUX_MM_H +/* + * Copyright (c) 2002. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Authors: + * Linus Torvalds + * Stephen Tweedie + * Andrea Arcangeli + * Rik van Riel + * Arjan van de Ven + * and others + */ #include #include @@ -18,9 +36,6 @@ extern unsigned long num_mappedpages; extern void * high_memory; extern int page_cluster; -/* The inactive_clean lists are per zone. */ -extern struct list_head active_list; -extern struct list_head inactive_list; #include #include @@ -134,6 +149,9 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -151,7 +169,11 @@ */ typedef struct page { struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ + struct address_space *mapping; /* The inode (or ...) we belong to. + * protected by PG_locked and the + * pagecache_lock. Hold one to read, + * both to write. + */ unsigned long index; /* Our offset within mapping. */ struct page *next_hash; /* Next page sharing our hash bucket in the pagecache hash table. */ @@ -159,7 +181,13 @@ unsigned long flags; /* atomic flags, some possibly updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; - protected by pagemap_lru_lock !! */ + protected by the lru lock !! */ + union { + struct pte_chain *chain;/* Reverse pte mapping pointer. + * protected by PG_chainlock */ + pte_addr_t direct; + } pte; + unsigned char age; /* Page aging counter. */ struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ @@ -266,7 +294,7 @@ * * Note that the referenced bit, the page->lru list_head and the * active, inactive_dirty and inactive_clean lists are protected by - * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit! + * the lru lock, and *NOT* by the usual PG_locked bit! * * PG_skip is used on sparc/sparc64 architectures to "skip" certain * parts of the address space. @@ -287,17 +315,22 @@ #define PG_referenced 2 #define PG_uptodate 3 #define PG_dirty 4 -#define PG_unused 5 -#define PG_lru 6 -#define PG_active 7 -#define PG_slab 8 -#define PG_skip 10 -#define PG_highmem 11 -#define PG_checked 12 /* kill me in 2.5.. */ -#define PG_arch_1 13 -#define PG_reserved 14 -#define PG_launder 15 /* written out by VM pressure.. */ -#define PG_fs_1 16 /* Filesystem specific */ +#define PG_active_anon 5 +#define PG_direct 6 +#define PG_inactive_dirty 7 +#define PG_inactive_laundry 8 +#define PG_inactive_clean 9 +#define PG_slab 10 +#define PG_skip 11 +#define PG_highmem 12 +#define PG_checked 13 /* kill me in 2.5.. */ +#define PG_arch_1 14 +#define PG_reserved 15 +#define PG_launder 16 /* written out by VM pressure.. */ +#define PG_chainlock 17 /* lock bit for ->pte_chain */ +#define PG_lru 18 +#define PG_active_cache 19 +#define PG_fs_1 20 /* Make it prettier to test the above... */ #define UnlockPage(page) unlock_page(page) @@ -317,6 +350,49 @@ #define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) /* + * inlines for acquisition and release of PG_chainlock + */ +static inline void pte_chain_lock(struct page *page) +{ + /* + * The preempt patch seems to be popular enough to + * warrant this little hack... 
+ */ +#ifdef CONFIG_PREEMPT + preempt_disable(); +#endif + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. + */ +#ifdef CONFIG_SMP + while (test_and_set_bit(PG_chainlock, &page->flags)) { + while (test_bit(PG_chainlock, &page->flags)) { + barrier(); + cpu_relax(); + } + } +#endif +} + +static inline void pte_chain_unlock(struct page *page) +{ +#ifdef CONFIG_SMP + clear_bit(PG_chainlock, &page->flags); +#endif + /* + * The preempt patch seems to be popular enough to + * warrant this little hack... + */ +#ifdef CONFIG_PREEMPT + preempt_enable(); +#endif +} + +/* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ @@ -378,6 +454,9 @@ * the clear_bit and the read of the waitqueue (to avoid SMP races with a * parallel wait_on_page). */ +#define PageDirect(page) test_bit(PG_direct, &(page)->flags) +#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) +#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) #define ClearPageError(page) clear_bit(PG_error, &(page)->flags) @@ -390,13 +469,34 @@ #define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) #define PageReserved(page) test_bit(PG_reserved, &(page)->flags) -#define PageActive(page) test_bit(PG_active, &(page)->flags) -#define SetPageActive(page) set_bit(PG_active, &(page)->flags) -#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) - -#define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define PageActiveAnon(page) test_bit(PG_active_anon, &(page)->flags) +#define SetPageActiveAnon(page) set_bit(PG_active_anon, &(page)->flags) +#define ClearPageActiveAnon(page) clear_bit(PG_active_anon, &(page)->flags) +#define TestandSetPageActiveAnon(page) test_and_set_bit(PG_active_anon, &(page)->flags) +#define TestandClearPageActiveAnon(page) test_and_clear_bit(PG_active_anon, &(page)->flags) + +#define PageActiveCache(page) test_bit(PG_active_cache, &(page)->flags) +#define SetPageActiveCache(page) set_bit(PG_active_cache, &(page)->flags) +#define ClearPageActiveCache(page) clear_bit(PG_active_cache, &(page)->flags) +#define TestandSetPageActiveCache(page) test_and_set_bit(PG_active_cache, &(page)->flags) +#define TestandClearPageActiveCache(page) test_and_clear_bit(PG_active_cache, &(page)->flags) + +#define PageInactiveLaundry(page) test_bit(PG_inactive_laundry, &(page)->flags) +#define SetPageInactiveLaundry(page) set_bit(PG_inactive_laundry, &(page)->flags) +#define ClearPageInactiveLaundry(page) clear_bit(PG_inactive_laundry, &(page)->flags) + +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) + +#define PageLRU(page) test_bit(PG_lru, 
&(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) +#define TestandSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -408,6 +508,16 @@ #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) /* + * Return true if this page is mapped into pagetables. Subtle: test pte.direct + * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain + * is only 32-bit. + */ +static inline int page_mapped(struct page *page) +{ + return page->pte.direct != 0; +} + +/* * Error return values for the *_nopage functions */ #define NOPAGE_SIGBUS (NULL) @@ -461,6 +571,7 @@ #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) +extern void FASTCALL(fixup_freespace(struct zone_struct *, int)); extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); @@ -479,7 +590,8 @@ extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); diff -Nru a/include/linux/mm_inline.h b/include/linux/mm_inline.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/mm_inline.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,424 @@ +#ifndef _LINUX_MM_INLINE_H +#define _LINUX_MM_INLINE_H + +#include +#include +#include + + +/* + * Copyright (c) 2002. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: + * Linus Torvalds + * Stephen Tweedie + * Andrea Arcangeli + * Rik van Riel + * Arjan van de Ven + * and others + */ + +GPL_HEADER() + +/* + * These inline functions tend to need bits and pieces of all the + * other VM include files, meaning they cannot be defined inside + * one of the other VM include files. + * + */ + +/** + * page_dirty - do we need to write the data out to disk + * @page: page to test + * + * Returns true if the page contains data which needs to + * be written to disk. Doesn't test the page tables (yet?). + */ +static inline int page_dirty(struct page *page) +{ + struct buffer_head *tmp, *bh; + + if (PageDirty(page)) + return 1; + + if (page->mapping && !page->buffers) + return 0; + + tmp = bh = page->buffers; + + do { + if (tmp->b_state & ((1<b_this_page; + } while (tmp != bh); + + return 0; +} + +/** + * page_anon - is this page ram/swap backed ? + * @page - page to test + * + * Returns 1 if the page is backed by ram/swap, 0 if the page is + * backed by a file in a filesystem on permanent storage. 
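
The pte union earlier in this hunk works together with PG_direct and PG_chainlock: a page mapped by a single pte keeps that pte's pte_addr_t in page->pte.direct, while a page with more mappings points page->pte.chain at a pte_chain, a structure private to mm/rmap.c. A small reader, as an illustration only (not code from this patch), using the helpers defined above:

static inline int example_page_uses_pte_chain(struct page *page)
{
	int chained;

	pte_chain_lock(page);
	chained = page_mapped(page) && !PageDirect(page);
	pte_chain_unlock(page);
	return chained;
}
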
+ */ +static inline int page_anon(struct page * page) +{ + /* Pages of an mmap()d file won't trigger this unless they get + * referenced on the inactive list and really are in the working + * set of the process... */ + if (page->pte.direct) + return 1; + + if (!page->mapping && !page->buffers) + return 1; + + if (PageSwapCache(page)) + return 1; + + /* TODO: ramfs, tmpfs shm segments and ramdisk */ + + return 0; +} + + + +static inline void add_page_to_active_anon_list(struct page * page, int age) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageActiveAnon(page); + list_add(&page->lru, &zone->active_anon_list[age]); + page->age = age + zone->anon_age_bias; + zone->active_anon_count[age]++; + zone->active_anon_pages++; +} + +static inline void add_page_to_active_cache_list(struct page * page, int age) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageActiveCache(page); + list_add(&page->lru, &zone->active_cache_list[age]); + page->age = age + zone->cache_age_bias; + zone->active_cache_count[age]++; + zone->active_cache_pages++; +} + +static inline void add_page_to_active_list(struct page * page, int age) +{ + if (page_anon(page)) + add_page_to_active_anon_list(page, age); + else + add_page_to_active_cache_list(page, age); +} + +static inline void add_page_to_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveDirty(page); + list_add(&page->lru, &zone->inactive_dirty_list); + zone->inactive_dirty_pages++; +} + +static inline void add_page_to_inactive_laundry_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveLaundry(page); + list_add(&page->lru, &zone->inactive_laundry_list); + zone->inactive_laundry_pages++; +} + +static inline void add_page_to_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveClean(page); + list_add(&page->lru, &zone->inactive_clean_list); + zone->inactive_clean_pages++; +} + +static inline void del_page_from_active_anon_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + unsigned char age; + list_del(&page->lru); + ClearPageActiveAnon(page); + zone->active_anon_pages--; + age = page->age - zone->anon_age_bias; + if (age<=MAX_AGE) + zone->active_anon_count[age]--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_active_cache_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + unsigned char age; + list_del(&page->lru); + ClearPageActiveCache(page); + zone->active_cache_pages--; + age = page->age - zone->cache_age_bias; + if (age<=MAX_AGE) + zone->active_cache_count[age]--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveDirty(page); + zone->inactive_dirty_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_laundry_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveLaundry(page); + zone->inactive_laundry_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveClean(page); + zone->inactive_clean_pages--; + DEBUG_LRU_PAGE(page); +} + +/* + * 
Inline functions to control some balancing in the VM. + * + * Note that we do both global and per-zone balancing, with + * most of the balancing done globally. + */ +#define PLENTY_FACTOR 2 +#define ALL_ZONES NULL +#define ANY_ZONE (struct zone_struct *)(~0UL) +#define INACTIVE_FACTOR 5 + +#define VM_MIN 0 +#define VM_LOW 1 +#define VM_HIGH 2 +#define VM_PLENTY 3 +static inline int zone_free_limit(struct zone_struct * zone, int limit) +{ + int free, target, delta; + + /* This is really nasty, but GCC should completely optimise it away. */ + if (limit == VM_MIN) + target = zone->pages_min; + else if (limit == VM_LOW) + target = zone->pages_low; + else if (limit == VM_HIGH) + target = zone->pages_high; + else + target = zone->pages_high * PLENTY_FACTOR; + + free = zone->free_pages + zone->inactive_clean_pages; + delta = target - free; + + return delta; +} + +static inline int free_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_free_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_free_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_free_limit(zone, limit); + } + + return shortage; +} + +/** + * free_min - test for critically low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a serious shortage of free and + * clean pages, zero or negative if there is no serious shortage. + */ +static inline int free_min(struct zone_struct * zone) +{ + return free_limit(zone, VM_MIN); +} + +/** + * free_low - test for low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a shortage of free and + * clean pages, zero or negative if there is no shortage. + */ +static inline int free_low(struct zone_struct * zone) +{ + return free_limit(zone, VM_LOW); +} + +/** + * free_high - test if amount of free pages is less than ideal + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free and clean + * pages is below kswapd's target, zero or negative if we + * have more than enough free and clean pages. + */ +static inline int free_high(struct zone_struct * zone) +{ + return free_limit(zone, VM_HIGH); +} + +/** + * free_plenty - test if enough pages are freed + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free + clean pages + * in a zone is not yet excessive and kswapd is still allowed to + * free pages here, a negative value if kswapd should leave the + * zone alone. + */ +static inline int free_plenty(struct zone_struct * zone) +{ + return free_limit(zone, VM_PLENTY); +} + +/* + * The inactive page target is the free target + 20% of (active + inactive) + * pages. + */ +static inline int zone_inactive_limit(struct zone_struct * zone, int limit) +{ + int inactive, target, inactive_base; + + inactive_base = zone->active_anon_pages + zone->active_cache_pages; + inactive_base /= INACTIVE_FACTOR; + + /* GCC should optimise this away completely. 
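
To make the shortage arithmetic above concrete, here is a stand-alone model of zone_free_limit()/free_limit(): a positive return value means the zone (or the system) is that many pages short of the chosen target, a negative value means surplus. The struct below is a minimal stand-in, not the kernel's zone_t:

struct zone_model {
        long free_pages, inactive_clean_pages;
        long pages_min, pages_low, pages_high;
};

enum vm_limit { VM_MIN, VM_LOW, VM_HIGH, VM_PLENTY };
#define PLENTY_FACTOR 2

static long zone_free_shortage(const struct zone_model *z, enum vm_limit limit)
{
        long target;

        switch (limit) {
        case VM_MIN:    target = z->pages_min;  break;
        case VM_LOW:    target = z->pages_low;  break;
        case VM_HIGH:   target = z->pages_high; break;
        default:        target = z->pages_high * PLENTY_FACTOR; break;
        }
        return target - (z->free_pages + z->inactive_clean_pages);
}

/* ANY_ZONE behaviour: clamp per-zone surpluses to zero so one roomy zone
 * cannot hide another zone's shortage (ALL_ZONES just sums raw deltas). */
static long any_zone_shortage(const struct zone_model *zones, int nr,
                              enum vm_limit limit)
{
        long shortage = 0;

        for (int i = 0; i < nr; i++) {
                long local = zone_free_shortage(&zones[i], limit);
                if (local > 0)
                        shortage += local;
        }
        return shortage;
}

free_min(), free_low(), free_high() and free_plenty() above are this computation with the four targets plugged in.
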
*/ + if (limit == VM_MIN) + target = zone->pages_high + inactive_base / 2; + else if (limit == VM_LOW) + target = zone->pages_high + inactive_base; + else + target = zone->pages_high + inactive_base * 2; + + inactive = zone->free_pages + zone->inactive_clean_pages + + zone->inactive_dirty_pages + zone->inactive_laundry_pages; + + return target - inactive; +} + +static inline int inactive_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_inactive_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_inactive_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_inactive_limit(zone, limit); + } + + return shortage; +} + +/** + * inactive_min - test for serious shortage of (free + inactive clean) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no serious shortage of (free + inactive clean) pages + */ +static inline int inactive_min(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_MIN); +} + +/** + * inactive_low - test for shortage of (free + inactive clean) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no shortage of (free + inactive clean) pages + */ +static inline int inactive_low(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_LOW); +} + +/** + * inactive_high - less than ideal amount of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have more than enough (free + inactive) pages + */ +static inline int inactive_high(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_HIGH); +} + +/* + * inactive_target - number of inactive pages we ought to have. + */ +static inline int inactive_target(void) +{ + int target; + + target = nr_active_anon_pages() + nr_active_cache_pages() + + nr_inactive_dirty_pages() + nr_inactive_clean_pages() + + nr_inactive_laundry_pages(); + + target /= INACTIVE_FACTOR; + + return target; +} + +static inline void lru_lock(struct zone_struct *zone) +{ + if (zone) { + br_read_lock(BR_LRU_LOCK); + spin_lock(&zone->lru_lock); + } else { + br_write_lock(BR_LRU_LOCK); + } +} + +static inline void lru_unlock(struct zone_struct *zone) +{ + if (zone) { + spin_unlock(&zone->lru_lock); + br_read_unlock(BR_LRU_LOCK); + } else { + br_write_unlock(BR_LRU_LOCK); + } +} + +#endif /* _LINUX_MM_INLINE_H */ diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/mmzone.h Thu Apr 17 15:25:14 2003 @@ -13,11 +13,7 @@ * Free memory management - zoned buddy allocator. 
*/ -#ifndef CONFIG_FORCE_MAX_ZONEORDER #define MAX_ORDER 10 -#else -#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER -#endif typedef struct free_area_struct { struct list_head free_list; @@ -25,6 +21,16 @@ } free_area_t; struct pglist_data; +struct pte_chain; + +#define MAX_AGE 15 +#define INITIAL_AGE 3 + +#define MAX_PER_CPU_PAGES 512 +typedef struct per_cpu_pages_s { + int nr_pages, max_nr_pages; + struct list_head head; +} __attribute__((aligned(L1_CACHE_BYTES))) per_cpu_t; /* * On machines where it is needed (eg PCs) we divide physical memory @@ -38,15 +44,32 @@ /* * Commonly accessed fields: */ + per_cpu_t cpu_pages[NR_CPUS]; spinlock_t lock; unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; + unsigned long active_anon_pages; + unsigned long active_cache_pages; + unsigned long inactive_dirty_pages; + unsigned long inactive_laundry_pages; + unsigned long inactive_clean_pages; + unsigned long pages_min, pages_low, pages_high, pages_plenty; int need_balance; + int need_scan; + int active_anon_count[MAX_AGE+1]; + int active_cache_count[MAX_AGE+1]; + unsigned char anon_age_bias, cache_age_bias; + unsigned long age_next, age_interval; /* * free areas of different sizes */ + struct list_head active_anon_list[MAX_AGE+1]; + struct list_head active_cache_list[MAX_AGE+1]; + struct list_head inactive_dirty_list; + struct list_head inactive_laundry_list; + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; + spinlock_t lru_lock; /* * wait_table -- the array holding the hash table @@ -142,9 +165,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; - -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) /* * The following two are not meant for general usage. They are here as diff -Nru a/include/linux/module.h b/include/linux/module.h --- a/include/linux/module.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/module.h Thu Apr 17 15:25:14 2003 @@ -287,6 +287,9 @@ static const char __module_license[] __attribute__((section(".modinfo"))) = \ "license=" license +#define GPL_HEADER() \ +static const char cpyright="This software may be freely redistributed under the terms of the GNU General Public License."; + /* Define the module variable, and usage macros. */ extern struct module __this_module; @@ -302,7 +305,6 @@ static const char __module_using_checksums[] __attribute__((section(".modinfo"))) = "using_checksums=1"; #endif - #else /* MODULE */ #define MODULE_AUTHOR(name) @@ -311,6 +313,7 @@ #define MODULE_SUPPORTED_DEVICE(name) #define MODULE_PARM(var,type) #define MODULE_PARM_DESC(var,desc) +#define GPL_HEADER() /* Create a dummy reference to the table to suppress gcc unused warnings. 
Put * the reference in the .data.exit section which is discarded when code is built diff -Nru a/include/linux/pagemap.h b/include/linux/pagemap.h --- a/include/linux/pagemap.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/pagemap.h Thu Apr 17 15:25:14 2003 @@ -90,6 +90,7 @@ extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); extern void ___wait_on_page(struct page *); +extern int wait_on_page_timeout(struct page *page, int timeout); static inline void wait_on_page(struct page * page) { diff -Nru a/include/linux/sched.h b/include/linux/sched.h --- a/include/linux/sched.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/sched.h Thu Apr 17 15:25:14 2003 @@ -235,7 +235,7 @@ unsigned long rss, total_vm, locked_vm; unsigned long def_flags; unsigned long cpu_vm_mask; - unsigned long swap_address; + unsigned long rlimit_rss; unsigned dumpable:1; @@ -254,6 +254,7 @@ mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ page_table_lock: SPIN_LOCK_UNLOCKED, \ mmlist: LIST_HEAD_INIT(name.mmlist), \ + rlimit_rss: RLIM_INFINITY, \ } struct signal_struct { @@ -335,8 +336,6 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; diff -Nru a/include/linux/swap.h b/include/linux/swap.h --- a/include/linux/swap.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/swap.h Thu Apr 17 15:25:14 2003 @@ -85,8 +85,11 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_buffer_pages(void); -extern int nr_active_pages; -extern int nr_inactive_pages; +extern unsigned int nr_active_anon_pages(void); +extern unsigned int nr_active_cache_pages(void); +extern unsigned int nr_inactive_dirty_pages(void); +extern unsigned int nr_inactive_laundry_pages(void); +extern unsigned int nr_inactive_clean_pages(void); extern atomic_t page_cache_size; extern atomic_t buffermem_pages; @@ -102,19 +105,62 @@ struct zone_t; +/* linux/mm/rmap.c */ +struct pte_chain; +extern int FASTCALL(page_referenced(struct page *, int *)); +extern struct pte_chain * FASTCALL(page_add_rmap(struct page *, pte_t *, + struct pte_chain *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern int FASTCALL(try_to_unmap(struct page *)); +struct pte_chain * pte_chain_alloc(int); +void __pte_chain_free(struct pte_chain *); + +static inline void pte_chain_free(struct pte_chain * pte_chain) +{ + if (pte_chain) + __pte_chain_free(pte_chain); +} + +/* return values of try_to_unmap */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 +#define SWAP_ERROR 3 + /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); +extern void FASTCALL(lru_cache_add_dirty(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); +extern void FASTCALL(drop_page(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages_zone(zone_t *, unsigned int)); -extern int FASTCALL(try_to_free_pages(unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern int 
rebalance_laundry_zone(struct zone_struct *, int, unsigned int); +extern void wakeup_kswapd(unsigned int); +extern void rss_free_pages(unsigned int); + +/* + * Limits, in percent, on how large the cache can be and how to do + * page reclaiming. If the cache is more than borrow% in size, we + * reclaim pages from the cache and won't swap out application pages. + * Check mm/vmscan.c for implementation details. + */ +struct cache_limits { + int min; + int borrow; + int max; +}; +extern struct cache_limits cache_limits; /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -128,6 +174,7 @@ extern void show_swap_cache_info(void); #endif extern int add_to_swap_cache(struct page *, swp_entry_t); +extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *page); @@ -158,49 +205,34 @@ asmlinkage long sys_swapoff(const char *); asmlinkage long sys_swapon(const char *, int); -extern spinlock_cacheline_t pagemap_lru_lock_cacheline; -#define pagemap_lru_lock pagemap_lru_lock_cacheline.lock extern void FASTCALL(mark_page_accessed(struct page *)); /* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 + +/* * List add/del helper macros. These must be called - * with the pagemap_lru_lock held! + * with the lru lock held! */ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ + if (PageActiveAnon(page)) \ BUG(); \ - if (PageActive(page)) \ + if (PageActiveCache(page)) \ + BUG(); \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveLaundry(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ BUG(); \ -} while (0) - -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ -} while (0) - -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - nr_inactive_pages--; \ } while (0) extern spinlock_t swaplock; diff -Nru a/init/main.c b/init/main.c --- a/init/main.c Thu Apr 17 15:25:14 2003 +++ b/init/main.c Thu Apr 17 15:25:14 2003 @@ -94,6 +94,7 @@ extern void sysctl_init(void); extern void signals_init(void); extern int init_pcmcia_ds(void); +extern void pte_chain_init(void); extern void free_initmem(void); @@ -397,6 +398,7 @@ mem_init(); kmem_cache_sizes_init(); pgtable_cache_init(); + pte_chain_init(); /* * For architectures that have highmem, num_mappedpages represents diff -Nru a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c Thu Apr 17 15:25:14 2003 +++ b/kernel/fork.c Thu Apr 17 15:25:14 2003 @@ -152,7 +152,6 @@ mm->map_count = 0; mm->rss = 0; mm->cpu_vm_mask = 0; - mm->swap_address = 0; pprev = &mm->mmap; /* @@ -276,9 +275,6 @@ void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - extern struct mm_struct *swap_mm; - if (swap_mm == mm) - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); @@ -701,8 +697,6 @@ #endif p->lock_depth = -1; 
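
The pte_chain interface declared above is shaped so that the possibly sleeping allocation happens before page_table_lock is taken: page_add_rmap() consumes the preallocated chain only if it needs it and returns whatever is left over. A condensed version of the calling convention used by the fault paths later in this patch; this is a fragment for illustration (mm, ptep, page and entry are whatever the caller has at hand), not compilable on its own:

	struct pte_chain *pte_chain = pte_chain_alloc(GFP_KERNEL);

	if (!pte_chain)
		return -ENOMEM;			/* caller-specific handling */

	spin_lock(&mm->page_table_lock);
	set_pte(ptep, entry);
	pte_chain = page_add_rmap(page, ptep, pte_chain);
	spin_unlock(&mm->page_table_lock);

	pte_chain_free(pte_chain);		/* no-op if the chain was consumed */
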
/* -1 = no lock */ p->start_time = jiffies; - - INIT_LIST_HEAD(&p->local_pages); retval = -ENOMEM; /* copy all the process information */ diff -Nru a/kernel/sys.c b/kernel/sys.c --- a/kernel/sys.c Thu Apr 17 15:25:14 2003 +++ b/kernel/sys.c Thu Apr 17 15:25:14 2003 @@ -1147,6 +1147,12 @@ if (resource == RLIMIT_NOFILE) { if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) return -EPERM; + } else if (resource == RLIMIT_RSS && current->mm) { + /* rlimit is specified in bytes, convert to pages */ + unsigned long pages = RLIM_INFINITY; + if (new_rlim.rlim_cur != RLIM_INFINITY) + pages = new_rlim.rlim_cur >> PAGE_SHIFT; + current->mm->rlimit_rss = pages; } *old_rlim = new_rlim; return 0; diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c --- a/kernel/sysctl.c Thu Apr 17 15:25:14 2003 +++ b/kernel/sysctl.c Thu Apr 17 15:25:14 2003 @@ -268,6 +268,8 @@ &bdflush_min, &bdflush_max}, {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory, sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec}, + {VM_PAGECACHE, "pagecache", &cache_limits, + sizeof(struct cache_limits), 0644, NULL, &proc_dointvec}, {VM_PAGERDAEMON, "kswapd", &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PGT_CACHE, "pagetable_cache", diff -Nru a/mm/Makefile b/mm/Makefile --- a/mm/Makefile Thu Apr 17 15:25:14 2003 +++ b/mm/Makefile Thu Apr 17 15:25:14 2003 @@ -14,7 +14,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o rmap.o obj-$(CONFIG_HIGHMEM) += highmem.o diff -Nru a/mm/TODO b/mm/TODO --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/TODO Thu Apr 17 15:25:14 2003 @@ -0,0 +1,38 @@ + VM TODO list + +Forever valid TODO entries: + - keep up with the official kernel + - port over bugfixes + - minimise the diff by keeping code in sync where possible + +Easy short-term features: + - reclaim swap space from refill_inactive() + - simplify SMP locking + - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with + one single function using a for_each_pte() macro + for_each_pte(ptep, mm, start_address, end_address) + - fix page_launder() to not eat horrible amounts of CPU or flush + all pages to disk at once + - better VM balancing, clean vs. dirty ratio + - fix loopback device deadlock + riel: nr_fract=70%, nr_fract_sync=80% + riel: setup a loopback fs ext2-on-ext2 + riel: boot with mem=64m + riel: then write a 500 meg file. + riel: current kernel livelocks. + - stabilise pte_highmem and integrate it with rmap + - page_cache_size per zone + - pte_chain list per zone + - get rid of other global structures/stats, make them per zone + +Long-term features: + - extensive VM statistics + - IO clustering for page_launder() and sync_old_buffers() + - readahead on per-VMA level (+ drop behind?) 
+ - more graceful degradation when the load gets high + - reducing readahead + - unfair pageout so not all apps fall over + - memory objects, using pagecache and tmpfs for storage so + the memory object itself doesn't introduce any new overhead + - using the memory objects, removing page table copying from fork() + - load control able to deal with really extreme loads, swapping diff -Nru a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c Thu Apr 17 15:25:14 2003 +++ b/mm/filemap.c Thu Apr 17 15:25:14 2003 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -54,15 +55,14 @@ spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED}; /* - * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock + * NOTE: to avoid deadlocking you must never acquire the lru lock * with the pagecache_lock held. * * Ordering: * swap_lock -> - * pagemap_lru_lock -> + * lru lock -> * pagecache_lock */ -spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) @@ -100,6 +100,8 @@ mapping->nrpages--; list_del(&page->list); + if (!mapping->nrpages) + refile_inode(mapping->host); page->mapping = NULL; } @@ -185,7 +187,7 @@ head = &inode->i_mapping->clean_pages; - spin_lock(&pagemap_lru_lock); + lru_lock(ALL_ZONES); spin_lock(&pagecache_lock); curr = head->next; @@ -207,6 +209,7 @@ if (page_count(page) != 1) goto unlock; + /* Manual lru del to avoid lock ordering problems */ __lru_cache_del(page); __remove_inode_page(page); UnlockPage(page); @@ -218,7 +221,7 @@ } spin_unlock(&pagecache_lock); - spin_unlock(&pagemap_lru_lock); + lru_unlock(ALL_ZONES); } static int do_flushpage(struct page *page, unsigned long offset) @@ -239,8 +242,11 @@ static void truncate_complete_page(struct page *page) { - /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) + /* + * Leave it on the LRU if it gets converted into anonymous buffers + * or anonymous process memory. + */ + if ((!page->buffers || do_flushpage(page, 0)) && !page->pte.direct) lru_cache_del(page); /* @@ -877,6 +883,32 @@ wake_up_all(waitqueue); } + +/* like wait_on_page but with a timeout (in jiffies). + * returns 1 on timeout + */ +int wait_on_page_timeout(struct page *page, int timeout) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + if (!PageLocked(page)) + return 0; + + add_wait_queue(waitqueue, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + timeout = schedule_timeout(timeout); + } while (PageLocked(page) && timeout); + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(waitqueue, &wait); + return PageLocked(page); +} + /* * Get a lock on the page, assuming we need to sleep * to get it.. @@ -1032,6 +1064,54 @@ } /* + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. 
+ * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + lru_lock(ALL_ZONES); + while (--index >= start) { + struct page **hash = page_hash(mapping, index); + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + spin_unlock(&pagecache_lock); + if (!page || !PageActiveCache(page)) + break; + drop_page(page); + } + lru_unlock(ALL_ZONES); +} + +/* * Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should @@ -1302,6 +1382,12 @@ if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif @@ -1313,16 +1399,23 @@ /* * Mark a page as having seen activity. * - * If it was already so marked, move it to the active queue and drop - * the referenced bit. Otherwise, just mark it for future action.. + * We immediately reclaim the inactive clean pages because those are + * counted as freeable. We don't modify the inactive dirty ones because + * we're never sure if those are freeable anyway. */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page)) { + /* Mark the page referenced, AFTER checking for previous usage.. */ + SetPageReferenced(page); + + if (unlikely(PageInactiveClean(page) || PageInactiveLaundry(page))) { + struct zone_struct *zone = page_zone(page); + int free = zone->free_pages + zone->inactive_clean_pages; + activate_page(page); - ClearPageReferenced(page); - } else - SetPageReferenced(page); + if (free <= zone->pages_min) + wakeup_kswapd(GFP_NOIO); + } } /* @@ -1860,7 +1953,7 @@ nr = max; /* And limit it to a sane percentage of the inactive list.. 
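
A stripped-down model of the drop-behind walk added above: starting just behind the current read position, walk backwards through the previous readahead window and deactivate pages that are still on the active cache list, stopping at the first page that is not there. An array stands in for the page cache and "window" stands in for file->f_rawin (illustrative sketch only):

enum pstate { ABSENT, INACTIVE, ACTIVE_CACHE };

static void drop_behind_model(enum pstate *cache, unsigned long index,
                              unsigned long window)
{
        unsigned long start;

        if (!index)                     /* nothing behind the first page */
                return;

        start = (index > window) ? index - window : 0;

        while (--index >= start) {
                if (cache[index] != ACTIVE_CACHE)
                        break;          /* window shrank or page is gone */
                cache[index] = INACTIVE;        /* drop_page() stand-in */
                if (index == 0)
                        break;          /* avoid unsigned wrap-around */
        }
}
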
*/ - max = nr_inactive_pages / 2; + max = (nr_inactive_clean_pages() + nr_inactive_laundry_pages()) / 2; if (nr > max) nr = max; @@ -2108,7 +2201,8 @@ struct page *page = pte_page(pte); if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { flush_tlb_page(vma, address); - set_page_dirty(page); + set_page_dirty(page); /* This actually does not sleep */ + return 0; } } return 0; @@ -2118,7 +2212,7 @@ unsigned long address, unsigned long size, struct vm_area_struct *vma, unsigned long offset, unsigned int flags) { - pte_t * pte; + pte_t *pte, *mapping; unsigned long end; int error; @@ -2129,7 +2223,7 @@ pmd_clear(pmd); return 0; } - pte = pte_offset(pmd, address); + mapping = pte = pte_offset_map(pmd, address); offset += address & PMD_MASK; address &= ~PMD_MASK; end = address + size; @@ -2141,6 +2235,7 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_unmap(mapping); return error; } @@ -3081,6 +3176,7 @@ unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; /* * Try to find the page in the cache. If it isn't there, @@ -3089,8 +3185,10 @@ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. @@ -3134,8 +3232,11 @@ unlock: kunmap(page); /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + mark_page_accessed(page); page_cache_release(page); if (status < 0) diff -Nru a/mm/memory.c b/mm/memory.c --- a/mm/memory.c Thu Apr 17 15:25:14 2003 +++ b/mm/memory.c Thu Apr 17 15:25:14 2003 @@ -45,8 +45,10 @@ #include #include #include +#include #include +#include #include #include @@ -92,7 +94,7 @@ */ static inline void free_one_pmd(pmd_t * dir) { - pte_t * pte; + struct page *pte; if (pmd_none(*dir)) return; @@ -101,8 +103,9 @@ pmd_clear(dir); return; } - pte = pte_offset(dir, 0); + pte = pmd_page(*dir); pmd_clear(dir); + pgtable_remove_rmap(pte); pte_free(pte); } @@ -138,6 +141,62 @@ return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); } +pte_t *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + struct page *new; + + new = pte_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + } + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pmd_present(*pmd)) { + pte_free(new); + goto out; + } + pgtable_add_rmap(new, mm, address); + pmd_populate(mm, pmd, new); + } +out: + if (pmd_present(*pmd)) + return pte_offset_map(pmd, address); + return NULL; +} + +pte_t *pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + pte_t *new; + + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. 
+ */ + if (pmd_present(*pmd)) { + pte_free_kernel(new); + goto out; + } + pmd_populate_kernel(mm, pmd, new); + } +out: + return pte_offset_kernel(pmd, address); +} + /* * This function clears all user-level page tables of a process - this @@ -171,7 +230,7 @@ * variable count and make things faster. -jj * * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc(). + * but may be dropped within pmd_alloc() and pte_alloc_map(). */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -180,6 +239,16 @@ unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + struct pte_chain * pte_chain = NULL; + + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + spin_unlock(&dst->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + spin_lock(&dst->page_table_lock); + if (!pte_chain) + goto nomem; + } src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -223,12 +292,12 @@ goto cont_copy_pmd_range; } - src_pte = pte_offset(src_pmd, address); - dst_pte = pte_alloc(dst, dst_pmd, address); + dst_pte = pte_alloc_map(dst, dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; struct page *ptepage; @@ -237,9 +306,11 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || @@ -260,23 +331,53 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + pte_chain = page_add_rmap(ptepage, dst_pte, + pte_chain); + if (pte_chain) + goto cont_copy_pte_range_noset; + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (pte_chain) + goto cont_copy_pte_range_noset; + + /* + * pte_chain allocation failed, and we need to + * run page reclaim. 
+ */ + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + spin_lock(&dst->page_table_lock); + if (!pte_chain) + goto nomem; + spin_lock(&src->page_table_lock); + dst_pte = pte_offset_map(dst_pmd, address); + src_pte = pte_offset_map_nested(src_pmd, + address); cont_copy_pte_range_noset: address += PAGE_SIZE; - if (address >= end) - goto out_unlock; + if (address >= end) { + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src->page_table_lock); + goto out; + } src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); + pte_unmap_nested(src_pte-1); + pte_unmap(dst_pte-1); spin_unlock(&src->page_table_lock); cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); } -out_unlock: - spin_unlock(&src->page_table_lock); out: + pte_chain_free(pte_chain); return 0; nomem: + pte_chain_free(pte_chain); return -ENOMEM; } @@ -294,7 +395,7 @@ static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) { unsigned long offset; - pte_t * ptep; + pte_t * ptep, *mapping; int freed = 0; if (pmd_none(*pmd)) @@ -304,7 +405,7 @@ pmd_clear(pmd); return 0; } - ptep = pte_offset(pmd, address); + mapping = ptep = pte_offset_map(pmd, address); offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; @@ -315,8 +416,10 @@ continue; if (pte_present(pte)) { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) + if (VALID_PAGE(page) && !PageReserved(page)) { freed ++; + page_remove_rmap(page, ptep); + } /* This will eventually call __free_pte on the pte. */ tlb_remove_page(tlb, ptep, address + offset); } else { @@ -324,6 +427,7 @@ pte_clear(ptep); } } + pte_unmap(mapping); return freed; } @@ -354,49 +458,65 @@ return freed; } -/* - * remove user pages in a given range. +#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) + +/** + * zap_page_range - remove user pages in a given range + * @mm: mm_struct containing the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap */ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; - unsigned long start = address, end = address + size; - int freed = 0; - - dir = pgd_offset(mm, address); - + unsigned long start, end, addr, block; + int freed; + /* - * This is a long-lived spinlock. That's fine. - * There's no contention, because the page table - * lock only protects against kswapd anyway, and - * even if kswapd happened to be looking at this - * process we _want_ it to get stuck. + * Break the work up into blocks of ZAP_BLOCK_SIZE pages: + * this decreases lock-hold time for the page_table_lock + * dramatically, which could otherwise be held for a very + * long time. This decreases lock contention and increases + * periods of preemptibility. 
*/ - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - flush_cache_range(mm, address, end); - tlb = tlb_gather_mmu(mm); + while (size) { + if (size > ZAP_BLOCK_SIZE) + block = ZAP_BLOCK_SIZE; + else + block = size; + + freed = 0; + start = addr = address; + end = address + block; + dir = pgd_offset(mm, address); - do { - freed += zap_pmd_range(tlb, dir, address, end - address); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); + BUG_ON(address >= end); - /* this will flush any remaining tlb entries */ - tlb_finish_mmu(tlb, start, end); + spin_lock(&mm->page_table_lock); + flush_cache_range(mm, start, end); + tlb = tlb_gather_mmu(mm); - /* - * Update rss for the mm_struct (not necessarily current->mm) - * Notice that rss is an unsigned long. - */ - if (mm->rss > freed) - mm->rss -= freed; - else - mm->rss = 0; - spin_unlock(&mm->page_table_lock); + do { + freed += zap_pmd_range(tlb, dir, addr, end - addr); + addr = (addr + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (addr && (addr < end)); + + /* this will flush any remaining tlb entries */ + tlb_finish_mmu(tlb, start, end); + + /* Update rss for the mm_struct (need not be current->mm) */ + if (mm->rss > freed) + mm->rss -= freed; + else + mm->rss = 0; + + spin_unlock(&mm->page_table_lock); + + address += block; + size -= block; + } } /* @@ -407,6 +527,7 @@ pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; + struct page *page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) @@ -416,19 +537,19 @@ if (pmd_none(*pmd) || pmd_bad(*pmd)) goto out; - ptep = pte_offset(pmd, address); + ptep = pte_offset_map(pmd, address); if (!ptep) goto out; pte = *ptep; + pte_unmap(ptep); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) - return pte_page(pte); + page = pte_page(pte); } - out: - return 0; + return page; } /* @@ -777,10 +898,11 @@ if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc(mm, pmd, address); + pte_t * pte = pte_alloc_map(mm, pmd, address); if (!pte) return -ENOMEM; zeromap_pte_range(pte, address, end - address, prot); + pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -849,18 +971,20 @@ static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, pgprot_t prot) { - unsigned long end; + unsigned long base, end; + base = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc(mm, pmd, address); + pte_t * pte = pte_alloc_map(mm, pmd, address + base); if (!pte) return -ENOMEM; - remap_pte_range(pte, address, end - address, address + phys_addr, prot); + remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -946,9 +1070,10 @@ * with the page_table_lock released. 
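
The reworked zap_page_range() above bounds how long page_table_lock is held by handling the range in ZAP_BLOCK_SIZE chunks and releasing the lock between chunks. The shape of that loop, modelled in user space (the mutex and the per-byte "teardown" are stand-ins for the real lock and pte teardown):

#include <pthread.h>
#include <stddef.h>

#define BLOCK_BYTES (256 * 4096)        /* 256 pages of 4K, as above */

static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;

static void zap_range_model(unsigned char *base, size_t address, size_t size)
{
        while (size) {
                size_t block = size > BLOCK_BYTES ? BLOCK_BYTES : size;

                pthread_mutex_lock(&range_lock);
                for (size_t i = 0; i < block; i++)
                        base[address + i] = 0;  /* pte teardown stand-in */
                pthread_mutex_unlock(&range_lock);

                address += block;
                size -= block;
        }
}

Dropping the lock between blocks is what gives other lockers, and preemption, a chance to run during a large unmap.
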
*/ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, pte_t *page_table, pte_t pte) + unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) { struct page *old_page, *new_page; + struct pte_chain * pte_chain = NULL; old_page = pte_page(pte); if (!VALID_PAGE(old_page)) @@ -960,10 +1085,12 @@ if (reuse) { flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return 1; /* Minor fault */ } } + pte_unmap(page_table); /* * Ok, we need to copy. Oh, well.. @@ -971,6 +1098,9 @@ page_cache_get(old_page); spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_mem; @@ -980,26 +1110,33 @@ * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add(new_page); /* Free the old page.. */ new_page = old_page; } + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); + pte_chain_free(pte_chain); return 1; /* Minor fault */ bad_wp_page: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); return -1; no_mem: page_cache_release(old_page); + pte_chain_free(pte_chain); return -1; } @@ -1096,6 +1233,10 @@ struct page *new_page; unsigned long offset; + /* Low on free memory ? Don't make things worse. */ + if (free_low(ALL_ZONES) < 0) + return; + /* * Get the number of handles we should do readahead io to. */ @@ -1116,13 +1257,15 @@ */ static int do_swap_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, - pte_t * page_table, pte_t orig_pte, int write_access) + pte_t * page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); + struct pte_chain * pte_chain = NULL; pte_t pte; int ret = 1; + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1135,7 +1278,9 @@ */ int retval; spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); retval = pte_same(*page_table, orig_pte) ? -1 : 1; + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return retval; } @@ -1145,7 +1290,11 @@ } mark_page_accessed(page); - + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + page_cache_release(page); + return -1; + } lock_page(page); /* @@ -1153,10 +1302,13 @@ * released the page table lock. 
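
With pte-highmem, page tables may live in high memory, so a pte pointer is only valid between pte_offset_map() (an atomic kmap) and pte_unmap(); that is why the fault paths here re-look the pte up after any point where they may have slept. A condensed fragment of the bracketing discipline, mirroring follow_page() earlier in this file (illustration only, not standalone code):

	pte_t *ptep, pte;

	ptep = pte_offset_map(pmd, address);
	pte = *ptep;		/* take a copy while the table is mapped */
	pte_unmap(ptep);	/* drop the temporary kernel mapping */

	/* work on the copied value; remap and re-check before modifying */
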
*/ spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); unlock_page(page); page_cache_release(page); + pte_chain_free(pte_chain); return 1; } @@ -1175,10 +1327,13 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return ret; } @@ -1187,18 +1342,31 @@ * spinlock held to protect against concurrent faults in * multithreaded programs. */ -static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr) { pte_t entry; + struct page * page = ZERO_PAGE(addr); + struct pte_chain * pte_chain; + int ret; + + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); + } /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; - /* Allocate our own private page. */ + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1207,27 +1375,36 @@ clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { page_cache_release(page); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - return 1; + ret = 1; + goto out; } mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); - mark_page_accessed(page); } set_pte(page_table, entry); + /* ignores ZERO PAGE */ + pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - return 1; /* Minor fault */ + ret = 1; /* Minor fault */ + goto out; no_mem: - return -1; + ret = -1; +out: + pte_chain_free(pte_chain); + return ret; } /* @@ -1243,13 +1420,15 @@ * spinlock held. Exit with the spinlock released. 
*/ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *page_table) + unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { + struct pte_chain * pte_chain; struct page * new_page; pte_t entry; if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, write_access, address); + return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1274,7 +1453,15 @@ new_page = page; } + mark_page_accessed(new_page); + + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + page_cache_release(new_page); + return -1; + } spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); /* * This silly early PAGE_DIRTY setting removes a race * due to the bad i386 page protection. But it's valid @@ -1294,16 +1481,21 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + pte_chain = page_add_rmap(new_page, page_table, pte_chain); + pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. */ + pte_unmap(page_table); page_cache_release(new_page); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return 1; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return 2; /* Major fault */ } @@ -1330,7 +1522,7 @@ */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, - int write_access, pte_t * pte) + int write_access, pte_t *pte, pmd_t *pmd) { pte_t entry; @@ -1342,18 +1534,19 @@ * drop the lock. */ if (pte_none(entry)) - return do_no_page(mm, vma, address, write_access, pte); - return do_swap_page(mm, vma, address, pte, entry, write_access); + return do_no_page(mm, vma, address, write_access, pte, pmd); + return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); } if (write_access) { if (!pte_write(entry)) - return do_wp_page(mm, vma, address, pte, entry); + return do_wp_page(mm, vma, address, pte, pmd, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); + pte_unmap(pte); spin_unlock(&mm->page_table_lock); return 1; } @@ -1370,6 +1563,14 @@ current->state = TASK_RUNNING; pgd = pgd_offset(mm, address); + /* + * If we are over our RSS limit and the system needs memory, + * we will free memory for the non-hogs and slow down a bit. + */ + if (mm->rlimit_rss && mm->rss > mm->rlimit_rss && + free_high(ALL_ZONES) > 0) + rss_free_pages(GFP_HIGHUSER); + /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. @@ -1378,9 +1579,9 @@ pmd = pmd_alloc(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc(mm, pmd, address); + pte_t * pte = pte_alloc_map(mm, pmd, address); if (pte) - return handle_pte_fault(mm, vma, address, write_access, pte); + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } spin_unlock(&mm->page_table_lock); return -1; @@ -1422,41 +1623,6 @@ return pmd_offset(pgd, address); } -/* - * Allocate the page table directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. 
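
For reference, mm->rlimit_rss is kept in pages (sys_setrlimit converts the byte value, earlier in this patch), and handle_mm_fault() only throttles a process that is both over its own limit and running on a system that is short of memory. The arithmetic as a stand-alone sketch, assuming 4K pages (PAGE_SHIFT 12):

#define SKETCH_PAGE_SHIFT       12
#define SKETCH_RLIM_INFINITY    (~0UL)

static unsigned long rss_limit_pages(unsigned long rlim_cur_bytes)
{
        if (rlim_cur_bytes == SKETCH_RLIM_INFINITY)
                return SKETCH_RLIM_INFINITY;
        return rlim_cur_bytes >> SKETCH_PAGE_SHIFT;
}

/* e.g. a 64 MB limit: rss_limit_pages(64UL << 20) == 16384 pages */

static int over_rss_limit(unsigned long rss, unsigned long rlimit_rss,
                          int global_free_shortage)
{
        /* only throttle hogs while the system as a whole wants memory */
        return rlimit_rss && rss > rlimit_rss && global_free_shortage > 0;
}
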
- */ -pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -{ - if (pmd_none(*pmd)) { - pte_t *new; - - /* "fast" allocation can happen without dropping the lock.. */ - new = pte_alloc_one_fast(mm, address); - if (!new) { - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (!pmd_none(*pmd)) { - pte_free(new); - goto out; - } - } - pmd_populate(mm, pmd, new); - } -out: - return pte_offset(pmd, address); -} - int make_pages_present(unsigned long addr, unsigned long end) { int ret, len, write; @@ -1486,10 +1652,12 @@ if (!pgd_none(*pgd)) { pmd = pmd_offset(pgd, addr); if (!pmd_none(*pmd)) { - pte = pte_offset(pmd, addr); + /* FIXME: shouldn't this be pte_offset_kernel ??? */ + pte = pte_offset_map(pmd, addr); if (pte_present(*pte)) { page = pte_page(*pte); } + pte_unmap(pte); } } return page; diff -Nru a/mm/mprotect.c b/mm/mprotect.c --- a/mm/mprotect.c Thu Apr 17 15:25:14 2003 +++ b/mm/mprotect.c Thu Apr 17 15:25:14 2003 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -15,7 +16,7 @@ static inline void change_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, pgprot_t newprot) { - pte_t * pte; + pte_t *pte, *mapping; unsigned long end; if (pmd_none(*pmd)) @@ -25,7 +26,7 @@ pmd_clear(pmd); return; } - pte = pte_offset(pmd, address); + mapping = pte = pte_offset_map(pmd, address); address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -44,6 +45,7 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_unmap(mapping); } static inline void change_pmd_range(pgd_t * pgd, unsigned long address, diff -Nru a/mm/mremap.c b/mm/mremap.c --- a/mm/mremap.c Thu Apr 17 15:25:14 2003 +++ b/mm/mremap.c Thu Apr 17 15:25:14 2003 @@ -9,13 +9,14 @@ #include #include #include +#include #include #include extern int vm_enough_memory(long pages); -static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +static inline pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) { pgd_t * pgd; pmd_t * pmd; @@ -39,30 +40,55 @@ goto end; } - pte = pte_offset(pmd, addr); - if (pte_none(*pte)) + pte = pte_offset_map_nested(pmd, addr); + if (pte_none(*pte)) { + pte_unmap_nested(pte); pte = NULL; + } end: return pte; } -static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +#ifdef CONFIG_HIGHPTE /* Save a few cycles on the sane machines */ +static inline int page_table_present(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return 0; + pmd = pmd_offset(pgd, addr); + return pmd_present(*pmd); +} +#else +#define page_table_present(mm, addr) (1) +#endif + +static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) { pmd_t * pmd; pte_t * pte = NULL; pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); if (pmd) - pte = pte_alloc(mm, pmd, addr); + pte = pte_alloc_map(mm, pmd, addr); return pte; } -static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) +static int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst, + struct pte_chain ** pte_chainp) { int error = 0; pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); pte = ptep_get_and_clear(src); if 
(!dst) { /* No dest? We must put it back. */ @@ -70,29 +96,53 @@ error++; } set_pte(dst, pte); + if (page) + *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } return error; } -static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +static int move_one_page(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr) { + struct mm_struct *mm = vma->vm_mm; + struct pte_chain * pte_chain; int error = 0; - pte_t * src; + pte_t *src, *dst; + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + return -1; spin_lock(&mm->page_table_lock); - src = get_one_pte(mm, old_addr); - if (src) - error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr)); + src = get_one_pte_map_nested(mm, old_addr); + if (src) { + /* + * Look to see whether alloc_one_pte_map needs to perform a + * memory allocation. If it does then we need to drop the + * atomic kmap + */ + if (!page_table_present(mm, new_addr)) { + pte_unmap_nested(src); + src = NULL; + } + dst = alloc_one_pte_map(mm, new_addr); + if (src == NULL) + src = get_one_pte_map_nested(mm, old_addr); + error = copy_one_pte(mm, src, dst, &pte_chain); + pte_unmap_nested(src); + pte_unmap(dst); + } + flush_tlb_page(vma, old_addr); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return error; } -static int move_page_tables(struct mm_struct * mm, +static int move_page_tables(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_addr, unsigned long len) { unsigned long offset = len; - flush_cache_range(mm, old_addr, old_addr + len); + flush_cache_range(vma, old_addr, old_addr + len); /* * This is not the clever way to do this, but we're taking the @@ -101,10 +151,9 @@ */ while (offset) { offset -= PAGE_SIZE; - if (move_one_page(mm, old_addr + offset, new_addr + offset)) + if (move_one_page(vma, old_addr + offset, new_addr + offset)) goto oops_we_failed; } - flush_tlb_range(mm, old_addr, old_addr + len); return 0; /* @@ -115,14 +164,14 @@ * the old page tables) */ oops_we_failed: - flush_cache_range(mm, new_addr, new_addr + len); + flush_cache_range(vma, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) - move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + move_one_page(vma, new_addr + offset, old_addr + offset); + zap_page_range(vma->vm_mm, new_addr, len); return -1; } -static inline unsigned long move_vma(struct vm_area_struct * vma, +static unsigned long move_vma(struct vm_area_struct * vma, unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr) { @@ -146,7 +195,8 @@ prev->vm_end = next->vm_end; __vma_unlink(mm, next, prev); spin_unlock(&mm->page_table_lock); - + if (vma == next) + vma = prev; mm->map_count--; kmem_cache_free(vm_area_cachep, next); } @@ -176,7 +226,7 @@ allocated_vma = 1; } - if (!move_page_tables(current->mm, new_addr, addr, old_len)) { + if (!move_page_tables(vma, new_addr, addr, old_len)) { if (allocated_vma) { *new_vma = *vma; new_vma->vm_start = new_addr; @@ -252,12 +302,14 @@ /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. 
+ * do_munmap does all the needed commit accounting */ ret = addr; if (old_len >= new_len) { do_munmap(current->mm, addr+new_len, old_len - new_len); if (!(flags & MREMAP_FIXED) || (new_addr == addr)) goto out; + old_len = new_len; } /* diff -Nru a/mm/oom_kill.c b/mm/oom_kill.c --- a/mm/oom_kill.c Thu Apr 17 15:25:14 2003 +++ b/mm/oom_kill.c Thu Apr 17 15:25:14 2003 @@ -168,6 +168,7 @@ static void oom_kill(void) { struct task_struct *p, *q; + extern wait_queue_head_t kswapd_done; read_lock(&tasklist_lock); p = select_bad_process(); @@ -183,6 +184,9 @@ } read_unlock(&tasklist_lock); + /* Chances are by this time our victim is sleeping on kswapd. */ + wake_up(&kswapd_done); + /* * Make kswapd go out of the way, so "p" has a good chance of * killing itself before someone else gets the chance to ask @@ -199,12 +203,6 @@ { static unsigned long first, last, count, lastkill; unsigned long now, since; - - /* - * Enough swap space left? Not OOM. - */ - if (nr_swap_pages > 0) - return; now = jiffies; since = now - last; diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c Thu Apr 17 15:25:14 2003 +++ b/mm/page_alloc.c Thu Apr 17 15:25:14 2003 @@ -10,6 +10,7 @@ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per-CPU page pool, Ingo Molnar, Red Hat, 2001, 2002 */ #include @@ -21,12 +22,10 @@ #include #include #include +#include +#include int nr_swap_pages; -int nr_active_pages; -int nr_inactive_pages; -LIST_HEAD(inactive_list); -LIST_HEAD(active_list); pg_data_t *pgdat_list; /* @@ -42,6 +41,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, }; +static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, }; /* * Temporary debugging check. @@ -84,6 +85,7 @@ unsigned long index, page_idx, mask, flags; free_area_t *area; struct page *base; + per_cpu_t *per_cpu; zone_t *zone; /* @@ -98,20 +100,30 @@ if (page->buffers) BUG(); - if (page->mapping) + if (page->mapping) { + printk(KERN_CRIT "Page has mapping still set. This is a serious situation. 
However if you \n"); + printk(KERN_CRIT "are using the NVidia binary only module please report this bug to \n"); + printk(KERN_CRIT "NVidia and not to the linux kernel mailinglist.\n"); BUG(); + } if (!VALID_PAGE(page)) BUG(); if (PageLocked(page)) BUG(); - if (PageActive(page)) + if (PageActiveAnon(page)) + BUG(); + if (PageActiveCache(page)) + BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveLaundry(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + if (page->pte.direct) BUG(); page->flags &= ~((1<flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: - + zone = page_zone(page); mask = (~0UL) << order; @@ -123,7 +135,18 @@ area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); + per_cpu = zone->cpu_pages + smp_processor_id(); + + __save_flags(flags); + __cli(); + if (!order && (per_cpu->nr_pages < per_cpu->max_nr_pages) && (free_high(zone) <= 0)) { + list_add(&page->list, &per_cpu->head); + per_cpu->nr_pages++; + __restore_flags(flags); + return; + } + + spin_lock(&zone->lock); zone->free_pages -= mask; @@ -158,17 +181,6 @@ list_add(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -198,13 +210,32 @@ static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); static struct page * rmqueue(zone_t *zone, unsigned int order) { + per_cpu_t *per_cpu = zone->cpu_pages + smp_processor_id(); free_area_t * area = zone->free_area + order; unsigned int curr_order = order; struct list_head *head, *curr; unsigned long flags; struct page *page; + int threshold = 0; + + if (!(current->flags & PF_MEMALLOC)) + threshold = (per_cpu->max_nr_pages / 8); + __save_flags(flags); + __cli(); - spin_lock_irqsave(&zone->lock, flags); + if (!order && (per_cpu->nr_pages>threshold)) { + if (unlikely(list_empty(&per_cpu->head))) + BUG(); + page = list_entry(per_cpu->head.next, struct page, list); + list_del(&page->list); + per_cpu->nr_pages--; + __restore_flags(flags); + + set_page_count(page, 1); + return page; + } + + spin_lock(&zone->lock); do { head = &area->free_list; curr = head->next; @@ -227,10 +258,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -249,76 +277,83 @@ } #endif -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do this work ourselves, call kswapd. 
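The hunks above give __free_pages_ok() and rmqueue() a per-CPU front end: order-0 frees are parked on a small per-CPU list (bounded by max_nr_pages) and order-0 allocations are served from that list first, so the zone spinlock is only taken when the per-CPU pool over- or underflows. A minimal userspace sketch of that fast-path/slow-path split follows; NCPU, MAX_PER_CPU, the int "page" handles and the global counter are invented stand-ins for struct page and the buddy lists, and the small reserve threshold the real rmqueue() applies is omitted here.

#include <stdio.h>

#define NCPU         2
#define MAX_PER_CPU  4

static int pool[NCPU][MAX_PER_CPU];     /* per-CPU stacks of free "pages" */
static int pool_count[NCPU];
static int global_free = 100;           /* stands in for the buddy lists  */

static void free_page(int cpu, int page)
{
        if (pool_count[cpu] < MAX_PER_CPU) {
                pool[cpu][pool_count[cpu]++] = page;    /* lock-free fast path */
                return;
        }
        global_free++;                   /* slow path: would take zone->lock */
}

static int alloc_page(int cpu)
{
        if (pool_count[cpu] > 0)
                return pool[cpu][--pool_count[cpu]];    /* lock-free fast path */
        /* slow path: would take zone->lock and go to the buddy lists */
        return --global_free;
}

int main(void)
{
        free_page(0, 42);
        printf("got page %d, global_free=%d\n", alloc_page(0), global_free);
        return 0;
}

The point of the design is that a page freed and reallocated on the same CPU never touches the shared zone lock or its cache lines.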
+ */ +void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +void fixup_freespace(zone_t * zone, int direct_reclaim) +{ + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages_ok(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(GFP_ATOMIC); +} + +#define PAGES_KERNEL 0 +#define PAGES_MIN 1 +#define PAGES_LOW 2 +#define PAGES_HIGH 3 + +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) { - struct page * page = NULL; - int __freed = 0; + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; - if (!(gfp_mask & __GFP_WAIT)) - goto out; - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages_zone(classzone, gfp_mask); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); + for (;;) { + zone_t *z = *(zone++); - break; - } - } while ((entry = entry->next) != local_pages); + if (!z) + break; + if (!z->size) + BUG(); + + /* + * We allocate if the number of (free + inactive_clean) + * pages is above the watermark. + */ + switch (limit) { + case PAGES_KERNEL: + water_mark = z->pages_min / 2; + break; + case PAGES_MIN: + water_mark = z->pages_min; + break; + case PAGES_LOW: + water_mark = z->pages_low; + break; + default: + case PAGES_HIGH: + water_mark = z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; } - current->nr_local_pages = 0; } - out: - *freed = __freed; - return page; + + /* Found nothing. */ + return NULL; } /* @@ -326,100 +361,262 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zone, * classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later tripped up by a schedule().) + * + * We fall back to lower-level zones if allocation + * in a higher zone fails. 
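__alloc_pages_limit() above is called repeatedly with a progressively lower limit (PAGES_HIGH, then PAGES_LOW, PAGES_MIN and finally PAGES_KERNEL), and on each pass a zone is eligible when its free plus inactive_clean pages are above the chosen watermark. The sketch below just reproduces that watermark test; struct toy_zone and the numbers are made up for the example, while the PAGES_* names and the pages_min/2 emergency mark come from the patch.

#include <stdio.h>

enum limit { PAGES_KERNEL, PAGES_MIN, PAGES_LOW, PAGES_HIGH };

struct toy_zone {
        long pages_min, pages_low, pages_high;
        long free_pages, inactive_clean_pages;
};

static int zone_has_room(const struct toy_zone *z, enum limit limit)
{
        long mark;

        switch (limit) {
        case PAGES_KERNEL: mark = z->pages_min / 2; break;  /* emergency pool */
        case PAGES_MIN:    mark = z->pages_min;     break;
        case PAGES_LOW:    mark = z->pages_low;     break;
        default:           mark = z->pages_high;    break;
        }
        return z->free_pages + z->inactive_clean_pages >= mark;
}

int main(void)
{
        struct toy_zone z = { .pages_min = 20, .pages_low = 40, .pages_high = 60,
                              .free_pages = 10, .inactive_clean_pages = 25 };

        printf("HIGH: %d  LOW: %d  MIN: %d  KERNEL: %d\n",
               zone_has_room(&z, PAGES_HIGH), zone_has_room(&z, PAGES_LOW),
               zone_has_room(&z, PAGES_MIN),  zone_has_room(&z, PAGES_KERNEL));
        return 0;
}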
+ */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data we would want to cache. + */ zone = zonelist->zones; - classzone = *zone; - if (classzone == NULL) + if (!*zone) return NULL; min = 1UL << order; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + BUG(); - min += z->pages_low; + min += z->pages_min; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); } - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Next, try to allocate a page from a zone with a HIGH + * amount of (free + inactive_clean) pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; + + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low of (free + inactive_clean) pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + wakeup_kswapd(gfp_mask); + + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Kernel allocations can eat a few emergency pages. + * We should be able to run without this, find out why + * the SCSI layer isn't happy ... + */ + if (gfp_mask & __GFP_HIGH) { + page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim); + if (page) + return page; + } + + /* + * Oh well, we didn't succeed. + */ + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * If so, try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * If we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we need to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... + * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. 
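The comments above describe the order in which the rewritten __alloc_pages() gets more aggressive: plain free pages first, then the HIGH and LOW watermark passes, a kswapd wakeup, the MIN pass, and a PAGES_KERNEL emergency pass that only __GFP_HIGH callers may use before the reclaim/defragment code runs. The following sketch only models that ordering; try_pass(), toy_alloc() and the "fullness" number are invented for the illustration and do not correspond to real helpers.

#include <stdio.h>

enum pass { FREE_ONLY, HIGH, LOW, MIN, KERNEL_RESERVE, NR_PASSES };

/* Pretend a pass succeeds once the allocator is desperate enough. */
static int try_pass(enum pass p, int fullness)
{
        return p >= fullness;           /* toy success criterion */
}

static int toy_alloc(int gfp_high, int fullness)
{
        enum pass p;

        for (p = FREE_ONLY; p < NR_PASSES; p++) {
                if (p == MIN)
                        printf("  (wakeup_kswapd would run here)\n");
                if (p == KERNEL_RESERVE && !gfp_high)
                        break;          /* emergency pages only for __GFP_HIGH */
                if (try_pass(p, fullness)) {
                        printf("  satisfied at pass %d\n", p);
                        return 1;
                }
        }
        printf("  would fall through to try_to_free_pages()/defragment\n");
        return 0;
}

int main(void)
{
        printf("light memory pressure:\n");
        toy_alloc(0, 1);
        printf("heavy pressure, __GFP_HIGH-style caller:\n");
        toy_alloc(1, 4);
        return 0;
}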
+ */ + if (gfp_mask & __GFP_WAIT) { + yield(); + if (!order || free_high(ALL_ZONES) >= 0) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail if no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } + } + } + + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... + */ zone = zonelist->zones; min = 1UL << order; for (;;) { - unsigned long local_min; zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - local_min = z->pages_min; - if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * death. + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } } + goto out_failed; - /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages, and last we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... + */ +defragment: + { + int try_harder = 0; + unsigned int mask = 0; + int numpages; +defragment_again: zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + continue; - page = rmqueue(z, order); - if (page) - return page; - } - return NULL; - } - - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - return NULL; - - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; + /* + * Try to free the zone's inactive laundry pages. + * Nonblocking in the first pass; blocking in the + * second pass, but never on very new IO. + */ + numpages = z->inactive_laundry_pages; + if (try_harder) { + numpages /= 2; + mask = gfp_mask; + } - zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; + current->flags |= PF_MEMALLOC; + rebalance_laundry_zone(z, numpages, mask); + current->flags &= ~PF_MEMALLOC; + + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } + } - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* If we can wait for IO to complete, we wait... 
*/ + if (!try_harder && (gfp_mask & __GFP_FS)) { + try_harder = 1; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - return NULL; - - /* Yield for kswapd, and try again */ - yield(); - goto rebalance; +out_failed: + /* No luck.. */ +// printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order); + return NULL; } /* @@ -461,18 +658,29 @@ } /* - * Total amount of free (allocatable) RAM: + * These statistics are held in per-zone counters, so we need to loop + * over each zone and read the statistics. We use this silly macro + * so we don't need to duplicate the code for every statistic. + * If you have a better idea on how to implement this (cut'n'paste + * isn't considered better), please let me know - Rik */ -unsigned int nr_free_pages (void) -{ - unsigned int sum = 0; - zone_t *zone; - - for_each_zone(zone) - sum += zone->free_pages; +#define NR_FOO_PAGES(__function_name, __stat) \ + unsigned int __function_name (void) \ + { \ + unsigned int sum = 0; \ + zone_t *zone; \ + \ + for_each_zone(zone) \ + sum += zone->__stat; \ + return sum; \ + } - return sum; -} +NR_FOO_PAGES(nr_free_pages, free_pages) +NR_FOO_PAGES(nr_active_anon_pages, active_anon_pages) +NR_FOO_PAGES(nr_active_cache_pages, active_cache_pages) +NR_FOO_PAGES(nr_inactive_dirty_pages, inactive_dirty_pages) +NR_FOO_PAGES(nr_inactive_laundry_pages, inactive_laundry_pages) +NR_FOO_PAGES(nr_inactive_clean_pages, inactive_clean_pages) /* * Amount of free RAM allocatable as buffer memory: @@ -488,10 +696,10 @@ zone_t *zone; for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + sum += zone->free_pages; + sum += zone->inactive_clean_pages; + sum += zone->inactive_laundry_pages; + sum += zone->inactive_dirty_pages; } } @@ -543,10 +751,16 @@ tmpdat = tmpdat->node_next; } - printk("( Active: %d, inactive: %d, free: %d )\n", - nr_active_pages, - nr_inactive_pages, - nr_free_pages()); + printk("Free pages: %6dkB (%6dkB HighMem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + nr_free_highpages() << (PAGE_SHIFT-10)); + + printk("( Active: %d/%d, inactive_laundry: %d, inactive_clean: %d, free: %d )\n", + nr_active_anon_pages() + nr_active_cache_pages(), + nr_inactive_dirty_pages(), + nr_inactive_laundry_pages(), + nr_inactive_clean_pages(), + nr_free_pages()); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -681,6 +895,7 @@ * - mark all memory queues empty * - clear the memory bitmaps */ +extern unsigned int kswapd_minfree; void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, unsigned long *zones_size, unsigned long zone_start_paddr, unsigned long *zholes_size, struct page *lmem_map) @@ -726,8 +941,9 @@ offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { + int k; zone_t *zone = pgdat->node_zones + j; - unsigned long mask; + unsigned long mask, extrafree = 0; unsigned long size, realsize; zone_table[nid * MAX_NR_ZONES + j] = zone; @@ -738,10 +954,45 @@ printk("zone(%lu): %lu pages.\n", j, size); zone->size = size; zone->name = zone_names[j]; + + for (k = 0; k < NR_CPUS; k++) { + per_cpu_t *per_cpu = zone->cpu_pages + k; + + INIT_LIST_HEAD(&per_cpu->head); + per_cpu->nr_pages = 0; + per_cpu->max_nr_pages = realsize / smp_num_cpus / 128; + if (per_cpu->max_nr_pages > MAX_PER_CPU_PAGES) + per_cpu->max_nr_pages = MAX_PER_CPU_PAGES; + else if (!per_cpu->max_nr_pages) + per_cpu->max_nr_pages = 1; + } zone->lock = 
SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->active_anon_pages = 0; + zone->active_cache_pages = 0; + zone->inactive_clean_pages = 0; + zone->inactive_laundry_pages = 0; + zone->inactive_dirty_pages = 0; zone->need_balance = 0; + zone->need_scan = 0; + zone->age_interval = HZ; + zone->age_next = jiffies; + for (k = 0; k <= MAX_AGE ; k++) { + INIT_LIST_HEAD(&zone->active_anon_list[k]); + zone->active_anon_count[k] = 0; + } + for (k = 0; k <= MAX_AGE ; k++) { + INIT_LIST_HEAD(&zone->active_cache_list[k]); + zone->active_cache_count[k] = 0; + } + zone->cache_age_bias = 0; + zone->anon_age_bias = 0; + INIT_LIST_HEAD(&zone->inactive_dirty_list); + INIT_LIST_HEAD(&zone->inactive_laundry_list); + INIT_LIST_HEAD(&zone->inactive_clean_list); + spin_lock_init(&zone->lru_lock); + if (!size) continue; @@ -761,21 +1012,30 @@ pgdat->nr_zones = j+1; + /* + * On large memory machines we keep extra memory + * free for kernel allocations. + */ + if (zone_extrafree_ratio[j]) + extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]); + if (extrafree < zone_balance_max[j]) + extrafree = 0; + mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; - else if (mask > zone_balance_max[j]) - mask = zone_balance_max[j]; - zone->pages_min = mask; - zone->pages_low = mask*2; - zone->pages_high = mask*3; - + zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]); + zone->pages_low = extrafree + mask*2; + zone->pages_high = extrafree + mask*3; + zone->pages_plenty = extrafree + mask*6; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) printk("BUG: wrong zone alignment, it will crash\n"); + + kswapd_minfree += zone->pages_min; /* * Initially all pages are reserved - free ones are freed diff -Nru a/mm/rmap.c b/mm/rmap.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/rmap.c Thu Apr 17 15:25:14 2003 @@ -0,0 +1,545 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. + */ + +/* + * Locking: + * - the page->pte_chain is protected by the PG_chainlock bit, + * which nests within the zone lru_lock, then the + * - the page->pte.chain is protected by the PG_chainlock bit, + * which nests within the lru lock, then the + * mm->page_table_lock, and then the page lock. + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* #define DEBUG_RMAP */ + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * We use an array of pte pointers in this structure to minimise cache + * misses while traversing reverse maps. 
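The watermark setup in free_area_init_core() above boils down to a little arithmetic: a per-zone "mask" derived from the zone size sets pages_min/low/high/plenty, and on large-memory machines an extra reserve ("extrafree") is stacked on top of every watermark. The snippet below redoes that arithmetic with the ratios the patch uses for the first zone; the zone size is an arbitrary example value, not a claim about any real configuration.

#include <stdio.h>

int main(void)
{
        unsigned long realsize = 262144;        /* example: 1 GB worth of 4 kB pages */
        unsigned long realtotal = realsize;     /* single-zone example */
        unsigned long balance_ratio = 128, balance_min = 20, balance_max = 255;
        unsigned long extrafree_ratio = 128, extrafree_max = 1024;

        unsigned long extrafree = realtotal / extrafree_ratio;
        if (extrafree > extrafree_max)
                extrafree = extrafree_max;
        if (extrafree < balance_max)            /* too small to bother with */
                extrafree = 0;

        unsigned long mask = realsize / balance_ratio;
        if (mask < balance_min)
                mask = balance_min;

        unsigned long pages_min = extrafree + (mask < balance_max ? mask : balance_max);
        printf("min=%lu low=%lu high=%lu plenty=%lu\n",
               pages_min, extrafree + mask * 2,
               extrafree + mask * 3, extrafree + mask * 6);
        return 0;
}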
+ */ +#define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t)) + +struct pte_chain { + struct pte_chain *next; + pte_addr_t ptes[NRPTE]; +} ____cacheline_aligned; + +static kmem_cache_t *pte_chain_cache; + +/* + * pte_chain list management policy: + * + * - If a page has a pte_chain list then it is shared by at least two + * processes, or by a process which has recently done a fork+exec, + * because a single sharing uses PageDirect. + * - The pageout code collapses pte_chains with a single user back into + * PageDirect pointers. This is done lazily so a process can do a number + * of fork+exec sequences without having to allocate and free pte_chains. + * - A pte_chain list has free space only in the head member - all succeeding + * members are 100% full. + * - If the head element has free space, it occurs in its leading slots. + * - All free space in the pte_chain is at the start of the head member. + * - Insertion into the pte_chain puts a pte pointer in the last free slot + * of the head member. + * - Removal from a pte chain moves the head pte of the head member onto the + * victim pte and frees the head member if it became empty. + */ + +/** + * pte_chain_alloc - allocate a pte_chain struct + * @gfp_flags: allocation flags + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. + * Caller needs to hold the page's pte_chain_lock. + */ +struct pte_chain * pte_chain_alloc(int gfp_flags) +{ + struct pte_chain * pte_chain; + + pte_chain = kmem_cache_alloc(pte_chain_cache, gfp_flags); +#ifdef DEBUG_RMAP + { + int i; + for (i = 0; i < NRPTE; i++) + BUG_ON(pte_chain->ptes[i]); + BUG_ON(pte_chain->next); + } +#endif + return pte_chain; +} + +/** + * __pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + */ +void __pte_chain_free(struct pte_chain *pte_chain) +{ + pte_chain->next = NULL; + kmem_cache_free(pte_chain_cache, pte_chain); +} + +/** + ** VM stuff below this comment + **/ + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * @rsslimit: place to put whether the page is over RSS limit + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * In addition to this it checks if the processes holding the + * page are over or under their RSS limit. + * Caller needs to hold the pte_chain_lock. + * + * If the page has a single-entry pte_chain, collapse that back to a + * PageDirect representation. This way, it's only done under memory + * pressure, giving a slight speedup to fork+exec for active forkers. + */ +int page_referenced(struct page * page, int * rsslimit) +{ + int referenced = 0, under_rsslimit = 0; + struct mm_struct * mm; + struct pte_chain * pc; + + if (PageTestandClearReferenced(page)) + referenced++; + + if (PageDirect(page)) { + pte_t *pte = rmap_ptep_map(page->pte.direct); + if (ptep_test_and_clear_young(pte)) + referenced++; + + mm = ptep_to_mm(pte); + if (mm->rss < mm->rlimit_rss) + under_rsslimit++; + rmap_ptep_unmap(pte); + } else { + int nr_chains = 0; + + /* Check all the page tables mapping this page. 
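NRPTE above sizes each pte_chain link so that the whole struct, one next pointer plus the array of pte slots, fills exactly one L1 cache line. The small program below simply evaluates that expression for a couple of assumed configurations; the 32- and 64-bit figures are examples, not a statement about any particular architecture.

#include <stdio.h>

static unsigned long nrpte(unsigned long cache_line,
                           unsigned long ptr_size,
                           unsigned long pte_addr_size)
{
        return (cache_line - ptr_size) / pte_addr_size;
}

int main(void)
{
        /* e.g. 32-bit pointers, 32-bit pte_addr_t, 32-byte cache line */
        printf("NRPTE = %lu\n", nrpte(32, 4, 4));       /* 7 slots  */
        /* e.g. 64-bit pointers, 64-bit pte_addr_t, 128-byte cache line */
        printf("NRPTE = %lu\n", nrpte(128, 8, 8));      /* 15 slots */
        return 0;
}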
*/ + for (pc = page->pte.chain; pc; pc = pc->next) { + int i; + + for (i = NRPTE-1; i >= 0; i--) { + pte_addr_t pte_paddr = pc->ptes[i]; + pte_t *pte; + + if (!pte_paddr) + break; + pte = rmap_ptep_map(pte_paddr); + if (ptep_test_and_clear_young(pte)) + referenced++; + mm = ptep_to_mm(pte); + if (mm->rss < mm->rlimit_rss) + under_rsslimit++; + rmap_ptep_unmap(pte); + nr_chains++; + } + } + if (nr_chains == 1) { + pc = page->pte.chain; + page->pte.direct = pc->ptes[NRPTE-1]; + SetPageDirect(page); + pc->ptes[NRPTE-1] = 0; + __pte_chain_free(pc); + } + } + + /* + * We're only over the RSS limit if all the processes sharing the + * page are. + */ + *rsslimit = !under_rsslimit; + + return referenced; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. + */ +struct pte_chain * +page_add_rmap(struct page * page, pte_t * ptep, struct pte_chain * pte_chain) +{ + pte_addr_t pte_paddr = ptep_to_paddr(ptep); + struct pte_chain * cur_pte_chain; + int i; + +#ifdef DEBUG_RMAP + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!ptep_to_mm(ptep)) + BUG(); +#endif + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return pte_chain; + + pte_chain_lock(page); + +#ifdef DEBUG_RMAP + /* + * This stuff needs help to get up to highmem speed. + */ + { + struct pte_chain * pc; + if (PageDirect(page)) { + if (page->pte.direct == pte_paddr) + BUG(); + } else { + for (pc = page->pte.chain; pc; pc = pc->next) { + for (i = 0; i < NRPTE; i++) { + pte_addr_t pte = pc->ptes[i]; + + if (pte && pte == pte_paddr) + BUG(); + } + } + } + } +#endif + + if (page->pte.direct == 0) { + page->pte.direct = pte_paddr; + SetPageDirect(page); + goto out; + } + + if (PageDirect(page)) { + /* Convert a direct pointer into a pte_chain */ + ClearPageDirect(page); + pte_chain->ptes[NRPTE-1] = page->pte.direct; + pte_chain->ptes[NRPTE-2] = pte_paddr; + page->pte.direct = 0; + page->pte.chain = pte_chain; + pte_chain = NULL; /* We consumed it */ + goto out; + } + + cur_pte_chain = page->pte.chain; + if (cur_pte_chain->ptes[0]) { /* It's full */ + pte_chain->next = cur_pte_chain; + page->pte.chain = pte_chain; + pte_chain->ptes[NRPTE-1] = pte_paddr; + pte_chain = NULL; /* We consumed it */ + goto out; + } + + BUG_ON(!cur_pte_chain->ptes[NRPTE-1]); + + for (i = NRPTE-2; i >= 0; i--) { + if (!cur_pte_chain->ptes[i]) { + cur_pte_chain->ptes[i] = pte_paddr; + goto out; + } + } + BUG(); +out: + pte_chain_unlock(page); + return pte_chain; +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + * Caller needs to hold the mm->page_table_lock. + */ +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + pte_addr_t pte_paddr = ptep_to_paddr(ptep); + struct pte_chain *pc; + + if (!page || !ptep) + BUG(); + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + if (!page_mapped(page)) + return; /* remap_page_range() from a driver? 
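page_add_rmap() above follows a fixed insert policy: a page with a single mapping keeps a "direct" pte pointer, a second mapping converts that into a pte_chain whose head link is filled from the last slot towards slot 0, and a full head gets a fresh link pushed in front of it. The userspace model below mirrors that policy; toy_page, toy_add_rmap, NRPTE=4 and the plain unsigned long "pte addresses" are inventions for the sketch, and error handling is omitted.

#include <stdio.h>
#include <stdlib.h>

#define NRPTE 4

struct pte_chain {
        struct pte_chain *next;
        unsigned long ptes[NRPTE];
};

struct toy_page {
        int direct_flag;                /* models PageDirect()     */
        unsigned long direct;           /* models page->pte.direct */
        struct pte_chain *chain;        /* models page->pte.chain  */
};

static void toy_add_rmap(struct toy_page *page, unsigned long pte)
{
        struct pte_chain *head;
        int i;

        if (!page->direct_flag && !page->chain) {       /* first mapping */
                page->direct = pte;
                page->direct_flag = 1;
                return;
        }
        if (page->direct_flag) {                        /* second mapping */
                head = calloc(1, sizeof(*head));
                head->ptes[NRPTE - 1] = page->direct;
                head->ptes[NRPTE - 2] = pte;
                page->direct_flag = 0;
                page->direct = 0;
                page->chain = head;
                return;
        }
        head = page->chain;
        if (head->ptes[0]) {                            /* head link is full */
                struct pte_chain *pc = calloc(1, sizeof(*pc));
                pc->next = head;
                pc->ptes[NRPTE - 1] = pte;
                page->chain = pc;
                return;
        }
        for (i = NRPTE - 2; i >= 0; i--)                /* fill towards slot 0 */
                if (!head->ptes[i]) {
                        head->ptes[i] = pte;
                        return;
                }
}

int main(void)
{
        struct toy_page page = { 0, 0, NULL };
        unsigned long pte;
        struct pte_chain *pc;

        for (pte = 0x1000; pte <= 0x6000; pte += 0x1000)
                toy_add_rmap(&page, pte);

        for (pc = page.chain; pc; pc = pc->next) {
                int i;
                for (i = 0; i < NRPTE; i++)
                        printf("%lx ", pc->ptes[i]);
                printf("\n");
        }
        return 0;
}

Filling from the back keeps all the free slots at the front of the head link, which is what the "all free space is at the start of the head member" invariant in the comment block above relies on.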
*/ + + pte_chain_lock(page); + + if (PageDirect(page)) { + if (page->pte.direct == pte_paddr) { + page->pte.direct = 0; + ClearPageDirect(page); + goto out; + } + } else { + struct pte_chain *start = page->pte.chain; + int victim_i = -1; + + for (pc = start; pc; pc = pc->next) { + int i; + + if (pc->next) + prefetch(pc->next); + for (i = 0; i < NRPTE; i++) { + pte_addr_t pa = pc->ptes[i]; + + if (!pa) + continue; + if (victim_i == -1) + victim_i = i; + if (pa != pte_paddr) + continue; + pc->ptes[i] = start->ptes[victim_i]; + start->ptes[victim_i] = 0; + if (victim_i == NRPTE-1) { + /* Emptied a pte_chain */ + page->pte.chain = start->next; + __pte_chain_free(start); + } + goto out; + } + } + } +#ifdef DEBUG_RMAP + /* Not found. This should NEVER happen! */ + printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep); + printk(KERN_ERR "page_remove_rmap: only found: "); + if (PageDirect(page)) { + printk("%llx", (u64)page->pte.direct); + } else { + for (pc = page->pte.chain; pc; pc = pc->next) { + int i; + for (i = 0; i < NRPTE; i++) + printk(" %d:%llx", i, (u64)pc->ptes[i]); + } + } + printk("\n"); + printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n"); +#endif + +out: + pte_chain_unlock(page); + return; +} + +/** + * try_to_unmap_one - worker function for try_to_unmap + * @page: page to unmap + * @ptep: page table entry to unmap from page + * + * Internal helper function for try_to_unmap, called for each page + * table entry mapping a page. Because locking order here is opposite + * to the locking order used by the page fault path, we use trylocks. + * Locking: + * lru lock page_launder() + * page lock page_launder(), trylock + * pte_chain_lock page_launder() + * mm->page_table_lock try_to_unmap_one(), trylock + */ +static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); +static int try_to_unmap_one(struct page * page, pte_addr_t paddr) +{ + pte_t *ptep = rmap_ptep_map(paddr); + unsigned long address = ptep_to_address(ptep); + struct mm_struct * mm = ptep_to_mm(ptep); + struct vm_area_struct * vma; + pte_t pte; + int ret; + + if (!mm) + BUG(); + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) { + rmap_ptep_unmap(ptep); + return SWAP_AGAIN; + } + + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* Nuke the page table entry. */ + pte = ptep_get_and_clear(ptep); + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + + /* Store the swap location in the pte. See handle_pte_fault() ... */ + if (PageSwapCache(page)) { + swp_entry_t entry = { .val = page->index }; + swap_duplicate(entry); + set_pte(ptep, swp_entry_to_pte(entry)); + } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pte)) + set_page_dirty(page); + + mm->rss--; + page_cache_release(page); + ret = SWAP_SUCCESS; + +out_unlock: + rmap_ptep_unmap(ptep); + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the zone lru lock + * and the page lock. 
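page_remove_rmap() above removes an entry from the packed pte_chain by copying the head link's first used slot (the "victim") over it and clearing the victim slot; if the victim was the head's last slot the head link is now empty and gets unlinked and freed. The companion sketch below models just that, with the same invented types and NRPTE=4 as the previous sketch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NRPTE 4

struct pte_chain {
        struct pte_chain *next;
        unsigned long ptes[NRPTE];      /* free slots (zeros) only at the front of the head */
};

static void toy_remove_rmap(struct pte_chain **chainp, unsigned long pte)
{
        struct pte_chain *start = *chainp, *pc;
        int victim = -1;

        for (pc = start; pc; pc = pc->next) {
                int i;
                for (i = 0; i < NRPTE; i++) {
                        if (!pc->ptes[i])
                                continue;
                        if (victim == -1)
                                victim = i;     /* first used slot of the head */
                        if (pc->ptes[i] != pte)
                                continue;
                        pc->ptes[i] = start->ptes[victim];
                        start->ptes[victim] = 0;
                        if (victim == NRPTE - 1) {      /* head link emptied */
                                *chainp = start->next;
                                free(start);
                        }
                        return;
                }
        }
}

int main(void)
{
        unsigned long full[NRPTE] = { 0x1000, 0x2000, 0x3000, 0x4000 };
        struct pte_chain *tail = calloc(1, sizeof(*tail));
        struct pte_chain *head = calloc(1, sizeof(*head));
        struct pte_chain *pc;

        memcpy(tail->ptes, full, sizeof(full));
        head->next = tail;
        head->ptes[NRPTE - 1] = 0x5000;         /* head holds a single entry */

        toy_remove_rmap(&head, 0x2000);         /* 0x5000 moves into its slot */
        toy_remove_rmap(&head, 0x1000);         /* leaves a hole at the front */

        for (pc = head; pc; pc = pc->next)
                printf("%lx %lx %lx %lx\n",
                       pc->ptes[0], pc->ptes[1], pc->ptes[2], pc->ptes[3]);
        return 0;
}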
Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_ERROR - an error occurred + */ +int try_to_unmap(struct page * page) +{ + struct pte_chain *pc, *next_pc, *start; + int ret = SWAP_SUCCESS; + int victim_i = -1; + + /* This page should not be on the pageout lists. */ + if (PageReserved(page)) + BUG(); + if (!PageLocked(page)) + BUG(); + /* We need backing store to swap out a page. */ + if (!page->mapping) + BUG(); + + if (PageDirect(page)) { + ret = try_to_unmap_one(page, page->pte.direct); + if (ret == SWAP_SUCCESS) { + page->pte.direct = 0; + ClearPageDirect(page); + } + goto out; + } + + start = page->pte.chain; + for (pc = start; pc; pc = next_pc) { + int i; + + next_pc = pc->next; + if (next_pc) + prefetch(next_pc); + for (i = 0; i < NRPTE; i++) { + pte_addr_t pte_paddr = pc->ptes[i]; + + if (!pte_paddr) + continue; + if (victim_i == -1) + victim_i = i; + + switch (try_to_unmap_one(page, pte_paddr)) { + case SWAP_SUCCESS: + /* + * Release a slot. If we're releasing the + * first pte in the first pte_chain then + * pc->ptes[i] and start->ptes[victim_i] both + * refer to the same thing. It works out. + */ + pc->ptes[i] = start->ptes[victim_i]; + start->ptes[victim_i] = 0; + victim_i++; + if (victim_i == NRPTE) { + page->pte.chain = start->next; + __pte_chain_free(start); + start = page->pte.chain; + victim_i = 0; + } + break; + case SWAP_AGAIN: + /* Skip this pte, remembering status. */ + ret = SWAP_AGAIN; + continue; + case SWAP_FAIL: + ret = SWAP_FAIL; + goto out; + case SWAP_ERROR: + ret = SWAP_ERROR; + goto out; + } + } + } +out: + return ret; +} + +/** + ** No more VM stuff below this comment, only pte_chain helper + ** functions. + **/ + +static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) +{ + struct pte_chain *pc = p; + + memset(pc, 0, sizeof(*pc)); +} + +void __init pte_chain_init(void) +{ + pte_chain_cache = kmem_cache_create( "pte_chain", + sizeof(struct pte_chain), + 0, + 0, + pte_chain_ctor, + NULL); + + if (!pte_chain_cache) + panic("failed to create pte_chain cache!\n"); +} diff -Nru a/mm/swap.c b/mm/swap.c --- a/mm/swap.c Thu Apr 17 15:25:14 2003 +++ b/mm/swap.c Thu Apr 17 15:25:14 2003 @@ -15,10 +15,10 @@ #include #include -#include #include #include #include +#include #include #include /* for copy_to/from_user */ @@ -33,22 +33,148 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list. 
+ * (some pages aren't on any list at all) + */ + ClearPageReferenced(page); + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_inactive_dirty_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void deactivate_page(struct page * page) +{ + lru_lock(page_zone(page)); + deactivate_page_nolock(page); + lru_unlock(page_zone(page)); +} + +/** + * drop_page - like deactivate_page, but try inactive_clean list + * @page: the page to drop + * + * Try to move a page to the inactive_clean list, this succeeds if the + * page is clean and not in use by anybody. If the page cannot be placed + * on the inactive_clean list it is placed on the inactive_dirty list + * instead. + * + * Note: this function gets called with the lru lock held. + */ +void drop_page_zone(struct zone_struct *zone, struct page * page) +{ + if (!TryLockPage(page)) { + if (page->mapping && page->buffers) { + page_cache_get(page); + lru_unlock(zone); + try_to_release_page(page, GFP_NOIO); + page_cache_release(page); + lru_lock(zone); + } + UnlockPage(page); + } + + /* Make sure the page really is reclaimable. */ + pte_chain_lock(page); + if (!page->mapping || PageDirty(page) || page->pte.direct || + page->buffers || page_count(page) > 1) + deactivate_page_nolock(page); + + else if (page_count(page) == 1) { + ClearPageReferenced(page); + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_clean_list(page); + } + } + pte_chain_unlock(page); +} + +void drop_page(struct page * page) +{ + if (!TryLockPage(page)) { + if (page->mapping && page->buffers) { + page_cache_get(page); + lru_unlock(ALL_ZONES); + try_to_release_page(page, GFP_NOIO); + page_cache_release(page); + lru_lock(ALL_ZONES); + } + UnlockPage(page); + } + + /* Make sure the page really is reclaimable. */ + pte_chain_lock(page); + if (!page->mapping || PageDirty(page) || page->pte.direct || + page->buffers || page_count(page) > 1) + deactivate_page_nolock(page); + + else if (page_count(page) == 1) { + ClearPageReferenced(page); + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_clean_list(page); + } + } + pte_chain_unlock(page); +} + /* * Move an inactive page to the active list. 
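The new VM splits the old active/inactive pair into five per-zone lists: active_anon, active_cache, inactive_dirty, inactive_laundry and inactive_clean. The enum state machine below is only a mental model of the transitions that deactivate_page(), drop_page() and activate_page() perform, under the assumption that a reactivated page goes back to the anonymous active list; the real code of course also moves pages between the actual list heads and keeps per-list counters.

#include <stdio.h>

enum lru_state {
        ACTIVE_ANON, ACTIVE_CACHE,
        INACTIVE_DIRTY, INACTIVE_LAUNDRY, INACTIVE_CLEAN,
};

static const char *lru_name[] = {
        "active_anon", "active_cache",
        "inactive_dirty", "inactive_laundry", "inactive_clean",
};

/* deactivate_page(): active pages move to inactive_dirty. */
static enum lru_state deactivate(enum lru_state s)
{
        if (s == ACTIVE_ANON || s == ACTIVE_CACHE)
                return INACTIVE_DIRTY;
        return s;                       /* inactive pages are left alone */
}

/* drop_page(): immediately reclaimable pages go straight to
 * inactive_clean, everything else falls back to deactivation. */
static enum lru_state drop(enum lru_state s, int reclaimable)
{
        if (reclaimable && s != INACTIVE_CLEAN)
                return INACTIVE_CLEAN;
        return deactivate(s);
}

/* activate_page(): any inactive list goes back to an active list
 * (anon vs. cache is decided elsewhere; assume anon here). */
static enum lru_state activate(enum lru_state s)
{
        if (s == INACTIVE_DIRTY || s == INACTIVE_LAUNDRY || s == INACTIVE_CLEAN)
                return ACTIVE_ANON;
        return s;
}

int main(void)
{
        enum lru_state s = ACTIVE_CACHE;

        s = deactivate(s);  printf("-> %s\n", lru_name[s]);
        s = drop(s, 1);     printf("-> %s\n", lru_name[s]);
        s = activate(s);    printf("-> %s\n", lru_name[s]);
        return 0;
}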
*/ -static inline void activate_page_nolock(struct page * page) +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); - add_page_to_active_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page, INITIAL_AGE); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_active_list(page, INITIAL_AGE); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); + add_page_to_active_list(page, INITIAL_AGE); } } void activate_page(struct page * page) { - spin_lock(&pagemap_lru_lock); + lru_lock(page_zone(page)); activate_page_nolock(page); - spin_unlock(&pagemap_lru_lock); + lru_unlock(page_zone(page)); } /** @@ -58,10 +184,10 @@ void lru_cache_add(struct page * page) { if (!PageLRU(page)) { - spin_lock(&pagemap_lru_lock); - if (!TestSetPageLRU(page)) - add_page_to_inactive_list(page); - spin_unlock(&pagemap_lru_lock); + lru_lock(page_zone(page)); + if (!TestandSetPageLRU(page)) + add_page_to_active_list(page, INITIAL_AGE); + lru_unlock(page_zone(page)); } } @@ -70,17 +196,22 @@ * @page: the page to add * * This function is for when the caller already holds - * the pagemap_lru_lock. + * the lru lock. */ void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } + ClearPageLRU(page); } /** @@ -89,9 +220,9 @@ */ void lru_cache_del(struct page * page) { - spin_lock(&pagemap_lru_lock); + lru_lock(page_zone(page)); __lru_cache_del(page); - spin_unlock(&pagemap_lru_lock); + lru_unlock(page_zone(page)); } /* diff -Nru a/mm/swap_state.c b/mm/swap_state.c --- a/mm/swap_state.c Thu Apr 17 15:25:14 2003 +++ b/mm/swap_state.c Thu Apr 17 15:25:14 2003 @@ -89,6 +89,40 @@ return 0; } +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + if (add_to_swap_cache(page, entry) == 0) { + SetPageUptodate(page); + set_page_dirty(page); + swap_free(entry); + return 1; + } + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + } +} + /* * This must be called only on pages that have * been verified to be in the swap cache. diff -Nru a/mm/swapfile.c b/mm/swapfile.c --- a/mm/swapfile.c Thu Apr 17 15:25:14 2003 +++ b/mm/swapfile.c Thu Apr 17 15:25:14 2003 @@ -362,8 +362,9 @@ * what to do if a write is requested later. 
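add_to_swap() above loops because a "speculative" read_swap_cache_async() can race and install the same swap entry first; in that case the entry is released and a fresh one is tried. Below is a shape-only sketch of that retry pattern, with fake helpers (get_slot, try_install) standing in for get_swap_page() and add_to_swap_cache().

#include <stdio.h>

static int slots = 3;                   /* pretend swap device capacity */
static int race_once = 1;               /* first attempt loses a race   */

static int get_slot(void)
{
        return slots > 0 ? slots-- : 0; /* 0 means "no swap space left" */
}

static int try_install(int slot)
{
        if (race_once) {                /* somebody else grabbed this slot */
                race_once = 0;
                return -1;
        }
        printf("installed page at swap slot %d\n", slot);
        return 0;
}

int main(void)
{
        for (;;) {
                int slot = get_slot();

                if (!slot)
                        return 1;       /* no swap space left: fail */
                if (try_install(slot) == 0)
                        return 0;       /* success */
                slots++;                /* raced: put the slot back, like swap_free() */
        }
}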
*/ /* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, - pte_t *dir, swp_entry_t entry, struct page* page) +static void +unuse_pte(struct vm_area_struct * vma, unsigned long address, pte_t * dir, + swp_entry_t entry, struct page * page, struct pte_chain ** pte_chainp) { pte_t pte = *dir; @@ -373,6 +374,7 @@ return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); ++vma->vm_mm->rss; } @@ -382,7 +384,8 @@ unsigned long address, unsigned long size, unsigned long offset, swp_entry_t entry, struct page* page) { - pte_t * pte; + struct pte_chain * pte_chain = NULL; + pte_t *pte, *mapping; unsigned long end; if (pmd_none(*dir)) @@ -392,17 +395,25 @@ pmd_clear(dir); return; } - pte = pte_offset(dir, address); + mapping = pte = pte_offset_map(dir, address); offset += address & PMD_MASK; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; do { - unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); + /* + * FIXME: handle pte_chain_alloc() failures + */ + if (pte_chain == NULL) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + unuse_pte(vma, offset+address-vma->vm_start, + pte, entry, page, &pte_chain); address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_unmap(mapping); + pte_chain_free(pte_chain); } /* mmlist_lock and vma->vm_mm->page_table_lock are held */ diff -Nru a/mm/vmalloc.c b/mm/vmalloc.c --- a/mm/vmalloc.c Thu Apr 17 15:25:14 2003 +++ b/mm/vmalloc.c Thu Apr 17 15:25:14 2003 @@ -31,7 +31,7 @@ pmd_clear(pmd); return; } - pte = pte_offset(pmd, address); + pte = pte_offset_kernel(pmd, address); address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -126,7 +126,7 @@ if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; if (alloc_area_pte(pte, address, end - address, gfp_mask, prot)) diff -Nru a/mm/vmscan.c b/mm/vmscan.c --- a/mm/vmscan.c Thu Apr 17 15:25:14 2003 +++ b/mm/vmscan.c Thu Apr 17 15:25:14 2003 @@ -12,6 +12,7 @@ * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). * Multiqueue VM started 5.8.00, Rik van Riel. + * O(1) rmap vm, Arjan van de ven */ #include @@ -23,9 +24,12 @@ #include #include #include +#include #include +static void refill_freelist(void); +static void wakeup_memwaiters(void); /* * The "priority" of VM scanning is how much of the queues we * will scan in one go. A value of 6 for DEF_PRIORITY implies @@ -34,674 +38,926 @@ */ #define DEF_PRIORITY (6) -/* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, - * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. - */ - -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +static inline void age_page_up_nolock(struct page *page, int old_age) { - pte_t pte; - swp_entry_t entry; + int new_age; + + new_age = old_age+4; + if (new_age < 0) + new_age = 0; + if (new_age > MAX_AGE) + new_age = MAX_AGE; + + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_active_anon_list(page, new_age); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_active_cache_list(page, new_age); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page, new_age); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_active_list(page, new_age); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); + add_page_to_active_list(page, new_age); + } else return; - /* Don't look at this pte if it's been accessed recently. */ - if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return 0; - } +} - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; - /* Don't bother replenishing zones not under pressure.. */ - if (!memclass(page_zone(page), classzone)) - return 0; +/* Must be called with page's pte_chain_lock held. */ +static inline int page_mapping_inuse(struct page * page) +{ + struct address_space * mapping = page->mapping; + + /* Page is in somebody's page tables. */ + if (page->pte.direct) + return 1; - if (TryLockPage(page)) + /* XXX: does this happen ? */ + if (!mapping) return 0; - /* From this point on, the odds are that we're going to - * nuke this pte, so read and clear the pte. This hook - * is needed on CPUs which update the accessed and dirty - * bits in hardware. - */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(page_table); - flush_tlb_page(vma, address); - - if (pte_dirty(pte)) - set_page_dirty(page); - - /* - * Is the page already in the swap cache? If so, then - * we can just drop our reference to it without doing - * any IO - it's already up-to-date on disk. - */ - if (PageSwapCache(page)) { - entry.val = page->index; - swap_duplicate(entry); -set_swap_pte: - set_pte(page_table, swp_entry_to_pte(entry)); -drop_pte: - mm->rss--; - UnlockPage(page); - { - int freeable = page_count(page) - !!page->buffers <= 2; - page_cache_release(page); - return freeable; - } - } + /* File is mmaped by somebody. */ + if (mapping->i_mmap || mapping->i_mmap_shared) + return 1; - /* - * Is it a clean page? Then it must be recoverable - * by just paging it in again, and we can just drop - * it.. or if it's dirty but has backing store, - * just mark the page dirty and drop it. - * - * However, this won't actually free any real - * memory, as the page will just be in the page cache - * somewhere, and as such we should just continue - * our scan. - * - * Basically, this just makes it possible for us to do - * some real work in the future in "refill_inactive()". - */ - if (page->mapping) - goto drop_pte; - if (!PageDirty(page)) - goto drop_pte; + return 0; +} - /* - * Anonymous buffercache pages can be left behind by - * concurrent truncate and pagefault. 
- */ - if (page->buffers) - goto preserve; +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. + */ +struct page * reclaim_page(zone_t * zone) +{ + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. + * We need to hold the pagecache_lock around all tests to make sure + * reclaim_page() doesn't race with other pagecache users */ - for (;;) { - entry = get_swap_page(); - if (!entry.val) - break; - /* Add it to the swap cache and mark it dirty - * (adding to the page cache will clear the dirty - * and uptodate bits, so we need to do it again) - */ - if (add_to_swap_cache(page, entry) == 0) { - SetPageUptodate(page); - set_page_dirty(page); - goto set_swap_pte; + lru_lock(zone); + spin_lock(&pagecache_lock); + maxscan = zone->inactive_clean_pages; + while (maxscan-- && !list_empty(&zone->inactive_clean_list)) { + page_lru = zone->inactive_clean_list.prev; + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageInactiveClean(page))); + + /* Page is being freed */ + if (unlikely(page_count(page)) == 0) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + continue; } - /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); - } - - /* No swap space left */ -preserve: - set_pte(page_table, pte); - UnlockPage(page); - return 0; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pte_t * pte; - unsigned long pmd_end; - if (pmd_none(*dir)) - return count; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return count; - } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; + /* Page cannot be reclaimed ? Move to inactive_dirty list. */ + pte_chain_lock(page); + if (unlikely(page->pte.direct || page->buffers || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TryLockPage(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + pte_chain_unlock(page); + continue; + } - do { - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); + /* + * From here until reaching either the bottom of the loop + * or found_page: the pte_chain_lock is held. + */ - if (VALID_PAGE(page) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page, classzone); - if (!count) { - address += PAGE_SIZE; - break; - } - } + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - mm->swap_address = address; - return count; -} -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pmd_t * pmd; - unsigned long pgd_end; + if (page->mapping) { + __remove_inode_page(page); + goto found_page; + } - if (pgd_none(*dir)) - return count; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return count; + /* We should never ever get here. */ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + pte_chain_unlock(page); + UnlockPage(page); } + spin_unlock(&pagecache_lock); + lru_unlock(zone); + return NULL; - pmd = pmd_offset(dir, address); - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); - if (!count) - break; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return count; +found_page: + __lru_cache_del(page); + pte_chain_unlock(page); + spin_unlock(&pagecache_lock); + lru_unlock(zone); + if (entry.val) + swap_free(entry); + UnlockPage(page); + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; } -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) +/** + * need_rebalance_dirty - do we need to write inactive stuff to disk? + * @zone: the zone in question + * + * Returns true if the zone in question has an inbalance between inactive + * dirty on one side and inactive laundry + inactive clean on the other + * Right now set the balance at 50%; may need tuning later on + */ +static inline int need_rebalance_dirty(zone_t * zone) { - pgd_t *pgdir; - unsigned long end; - - /* Don't swap out areas which are reserved */ - if (vma->vm_flags & VM_RESERVED) - return count; + if (zone->inactive_dirty_pages > zone->inactive_laundry_pages + zone->inactive_clean_pages) + return 1; - pgdir = pgd_offset(mm, address); - - end = vma->vm_end; - BUG_ON(address >= end); - do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); - if (!count) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return count; + return 0; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; +/** + * need_rebalance_laundry - does the zone have too few inactive_clean pages? + * @zone: the zone in question + * + * Returns true if the zone in question has too few pages in inactive clean + * + free + */ +static inline int need_rebalance_laundry(zone_t * zone) +{ + if (free_low(zone) >= 0) + return 1; + return 0; +} -/* - * Returns remaining count of pages to be swapped out by followup call. +/** + * launder_page - clean dirty page, move to inactive_laundry list + * @zone: zone to free pages in + * @gfp_mask: what operations we are allowed to do + * @page: the page at hand, must be on the inactive dirty list + * + * per-zone lru lock is assumed to be held, but this function can drop + * it and sleep, so no other locks are allowed to be held. 
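The need_rebalance_dirty()/need_rebalance_laundry() helpers above drive the flow of pages through the three inactive stages: start laundering once dirty pages outnumber laundry plus clean pages, and push laundered pages on towards the clean list when clean plus free pages run short. The struct and the shortage test below are simplified stand-ins; in particular the real laundry test uses free_low(), which is only approximated here by comparing against pages_low.

#include <stdio.h>

struct toy_zone {
        long inactive_dirty, inactive_laundry, inactive_clean;
        long free_pages, pages_low;
};

static int need_rebalance_dirty(const struct toy_zone *z)
{
        return z->inactive_dirty > z->inactive_laundry + z->inactive_clean;
}

static int need_rebalance_laundry(const struct toy_zone *z)
{
        /* assumption: "short on memory" modelled as free+clean below pages_low */
        return z->free_pages + z->inactive_clean < z->pages_low;
}

int main(void)
{
        struct toy_zone z = { 400, 100, 50, 30, 128 };

        printf("start laundering dirty pages: %d\n", need_rebalance_dirty(&z));
        printf("push laundry towards clean:   %d\n", need_rebalance_laundry(&z));
        return 0;
}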
+ * + * returns 0 for failure; 1 for success */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) +int launder_page(zone_t * zone, int gfp_mask, struct page *page) { - unsigned long address; - struct vm_area_struct* vma; + int over_rsslimit; + + /* + * Page is being freed, don't worry about it, but report progress. + */ + if (unlikely(page_count(page)) == 0) + return 1; + BUG_ON(!PageInactiveDirty(page)); + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_laundry_list(page); + /* store the time we start IO */ + page->age = (jiffies/HZ)&255; /* - * Find the proper vm-area after freezing the vma chain - * and ptes. + * The page is locked. IO in progress? + * If so, move to laundry and report progress + * Acquire PG_locked early in order to safely + * access page->mapping. */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - if (address == TASK_SIZE || swap_mm != mm) { - /* We raced: don't count this mm but try again */ - ++*mmcounter; - goto out_unlock; + if (unlikely(TryLockPage(page))) { + return 1; } - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - for (;;) { - count = swap_out_vma(mm, vma, address, count, classzone); - vma = vma->vm_next; - if (!vma) - break; - if (!count) - goto out_unlock; - address = vma->vm_start; + /* + * The page is in active use or really unfreeable. Move to + * the active list and adjust the page age if needed. + */ + pte_chain_lock(page); + if (page_referenced(page, &over_rsslimit) && !over_rsslimit && + page_mapping_inuse(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_active_list(page, INITIAL_AGE); + pte_chain_unlock(page); + UnlockPage(page); + return 1; + } + + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page->pte.direct && !page->mapping && !page->buffers) { + page_cache_get(page); + pte_chain_unlock(page); + lru_unlock(zone); + if (!add_to_swap(page)) { + activate_page(page); + UnlockPage(page); + page_cache_release(page); + lru_lock(zone); + return 0; } + page_cache_release(page); + lru_lock(zone); + /* Note: may be on another list ! */ + if (!PageInactiveLaundry(page)) { + UnlockPage(page); + return 1; + } + if (unlikely(page_count(page)) == 0) { + UnlockPage(page); + return 1; + } + pte_chain_lock(page); } - /* Indicate that we reached the end of address space */ - mm->swap_address = TASK_SIZE; -out_unlock: - spin_unlock(&mm->page_table_lock); - return count; -} + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. 
+ */ + if (page->pte.direct && page->mapping) { + switch (try_to_unmap(page)) { + case SWAP_ERROR: + case SWAP_FAIL: + goto page_active; + case SWAP_AGAIN: + pte_chain_unlock(page); + UnlockPage(page); + lru_unlock(zone); + cpu_relax(); + lru_lock(zone); + return 0; + case SWAP_SUCCESS: + ; /* fall through, try freeing the page below */ + /* fixme: add a SWAP_MLOCK case */ + } + } + pte_chain_unlock(page); -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) -{ - int counter, nr_pages = SWAP_CLUSTER_MAX; - struct mm_struct *mm; + if (PageDirty(page) && page->mapping) { + /* + * The page can be dirtied after we start writing, but + * in that case the dirty bit will simply be set again + * and we'll need to write it again. + */ + int (*writepage)(struct page *); - counter = mmlist_nr; - do { - if (unlikely(current->need_resched)) { - __set_current_state(TASK_RUNNING); - schedule(); - } + writepage = page->mapping->a_ops->writepage; + if ((gfp_mask & __GFP_FS) && writepage) { + ClearPageDirty(page); + SetPageLaunder(page); + page_cache_get(page); + lru_unlock(zone); + + writepage(page); - spin_lock(&mmlist_lock); - mm = swap_mm; - while (mm->swap_address == TASK_SIZE || mm == &init_mm) { - mm->swap_address = 0; - mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == swap_mm) - goto empty; - swap_mm = mm; + page_cache_release(page); + lru_lock(zone); + return 1; + } else { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_dirty_list(page); + /* FIXME: this is wrong for !__GFP_FS !!! */ + UnlockPage(page); + return 0; } + } - /* Make sure the mm doesn't disappear when we drop the lock.. */ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + */ + if (page->buffers) { + /* To avoid freeing our page before we're done. */ + page_cache_get(page); + lru_unlock(zone); - nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); + try_to_release_page(page, gfp_mask); + UnlockPage(page); - mmput(mm); + /* + * If the buffers were the last user of the page we free + * the page here. Because of that we shouldn't hold the + * lru lock yet. + */ + page_cache_release(page); - if (!nr_pages) - return 1; - } while (--counter >= 0); + lru_lock(zone); + return 1; + } + /* + * If the page is really freeable now, move it to the + * inactive_laundry list to keep LRU order. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. + * This test is not safe from races; only the one in + * reclaim_page() needs to be. + */ + pte_chain_lock(page); + if (page->mapping && !PageDirty(page) && !page->pte.direct && + page_count(page) == 1) { + pte_chain_unlock(page); + UnlockPage(page); + return 1; + } else { + /* + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it + * back to the active list. + */ + page_active: + activate_page_nolock(page); + pte_chain_unlock(page); + UnlockPage(page); + } return 0; +} -empty: - spin_unlock(&mmlist_lock); - return 0; +/* + * The aging interval varies from fast to really slow, it is + * important that we never age too fast and desirable that we + * keep the pages sorted in order for eviction. 
+ * + * Note that while most of the time kscand's recalculating of + * the per zone aging interval should be good enough, we want + * the ability to do "emergency wakeups" here since memory zones + * can suddenly come under VM pressure. + */ +#define MAX_AGING_INTERVAL ((unsigned long)300*HZ) +#define MIN_AGING_INTERVAL ((unsigned long)HZ/2) +static void speedup_aging(struct zone_struct * zone) +{ + zone->need_scan++; + if (zone->need_scan > 3) { + unsigned long next_wakeup = jiffies + MIN_AGING_INTERVAL; + if (time_before(next_wakeup, zone->age_next)) + zone->age_next = next_wakeup; + } } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +/* Ages down all pages on the active list */ +/* assumes the lru lock held */ +static inline void kachunk_anon(struct zone_struct * zone) { - struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10); - - spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { - struct page * page; - - if (unlikely(current->need_resched)) { - spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); - schedule(); - spin_lock(&pagemap_lru_lock); - continue; - } + int k; + if (!list_empty(&zone->active_anon_list[0])) + return; + if (!zone->active_anon_pages) + return; + + for (k = 0; k < MAX_AGE; k++) { + list_splice_init(&zone->active_anon_list[k+1], &zone->active_anon_list[k]); + zone->active_anon_count[k] = zone->active_anon_count[k+1]; + zone->active_anon_count[k+1] = 0; + } - page = list_entry(entry, struct page, lru); + zone->anon_age_bias++; + speedup_aging(zone); +} - BUG_ON(!PageLRU(page)); - BUG_ON(PageActive(page)); +static inline void kachunk_cache(struct zone_struct * zone) +{ + int k; + if (!list_empty(&zone->active_cache_list[0])) + return; + if (!zone->active_cache_pages) + return; + + for (k = 0; k < MAX_AGE; k++) { + list_splice_init(&zone->active_cache_list[k+1], &zone->active_cache_list[k]); + zone->active_cache_count[k] = zone->active_cache_count[k+1]; + zone->active_cache_count[k+1] = 0; + } - list_del(entry); - list_add(entry, &inactive_list); + zone->cache_age_bias++; + speedup_aging(zone); +} - /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. - */ - if (unlikely(!page_count(page))) - continue; +#define BATCH_WORK_AMOUNT 64 - if (!memclass(page_zone(page), classzone)) - continue; +/* + * returns the active cache ratio relative to the total active list + * times 100 (eg. 30% cache returns 30) + */ +static inline int cache_ratio(struct zone_struct * zone) +{ + if (!zone->size) + return 0; + return 100 * zone->active_cache_pages / (zone->active_cache_pages + + zone->active_anon_pages + 1); +} + +struct cache_limits cache_limits = { + .min = 1, + .borrow = 15, + .max = 100, +}; + +/** + * refill_inactive_zone - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan + * + * This function will scan a portion of the active list of a zone to find + * unused pages, those pages will then be moved to the inactive list. 
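A compact userspace sketch of the bucket rotation that kachunk_anon()/kachunk_cache() perform: when age bucket 0 runs dry, every bucket slides down one slot and a bias counter records the shift. Only per-bucket counts are modelled here; the real code splices the list heads as well, and the MAX_AGE value below is just a placeholder:

#include <stdio.h>

#define MAX_AGE 8

static unsigned long count[MAX_AGE + 1] = { 0, 0, 3, 0, 5, 0, 0, 2, 1 };
static unsigned long age_bias;

static void kachunk(void)
{
	int k;

	if (count[0])                  /* bucket 0 still has pages */
		return;
	for (k = 0; k < MAX_AGE; k++)
		count[k] = count[k + 1];
	count[MAX_AGE] = 0;
	age_bias++;                    /* logical ages shift, pages don't move */
}

int main(void)
{
	kachunk();
	kachunk();
	printf("bucket0=%lu bias=%lu\n", count[0], age_bias);
	return 0;
}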
+ */ +int refill_inactive_zone(struct zone_struct * zone, int priority, int target) +{ + int maxscan = (zone->active_anon_pages + zone->active_cache_pages) >> priority; + struct list_head * page_lru; + struct page * page; + int over_rsslimit; + int progress = 0; + int reclaim_anon = 0; + int reclaim_cache = 1; + + /* Take the lock while messing with the list... */ + lru_lock(zone); + if (target < BATCH_WORK_AMOUNT) + target = BATCH_WORK_AMOUNT; + + if (cache_ratio(zone) < cache_limits.borrow) + reclaim_anon = 1; + if (cache_ratio(zone) < cache_limits.min) + reclaim_cache = 0; + /* Could happen if the sysadmin sets borrow below min... */ + if (!reclaim_anon && !reclaim_cache) + reclaim_cache = reclaim_anon = 1; + + while (maxscan-- && zone->active_anon_pages + zone->active_cache_pages > 0 && target > 0) { + int anon_work = 0, cache_work = 0; + if (reclaim_anon) + anon_work = BATCH_WORK_AMOUNT; + if (reclaim_cache) + cache_work = BATCH_WORK_AMOUNT; + + while (--anon_work >= 0 && zone->active_anon_pages) { + if (list_empty(&zone->active_anon_list[0])) { + kachunk_anon(zone); + continue; + } - /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) - goto page_mapped; + page_lru = zone->active_anon_list[0].prev; + page = list_entry(page_lru, struct page, lru); - /* - * The page is locked. IO in progress? - * Move it to the back of the list. - */ - if (unlikely(TryLockPage(page))) { - if (PageLaunder(page) && (gfp_mask & __GFP_FS)) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - wait_on_page(page); - page_cache_release(page); - spin_lock(&pagemap_lru_lock); + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageActiveAnon(page))); + + /* Needed to follow page->mapping */ + if (TryLockPage(page)) { + /* The page is already locked. This for sure means + * someone is doing stuff with it which makes it + * active by definition ;) + */ + del_page_from_active_anon_list(page); + add_page_to_active_anon_list(page, INITIAL_AGE); + continue; } - continue; - } - if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { /* - * It is not critical here to write it only if - * the page is unmapped beause any direct writer - * like O_DIRECT would set the PG_dirty bitflag - * on the phisical page after having successfully - * pinned it and after the I/O to the page is finished, - * so the direct writes to the page cannot get lost. + * Do aging on the pages. */ - int (*writepage)(struct page *); - - writepage = page->mapping->a_ops->writepage; - if ((gfp_mask & __GFP_FS) && writepage) { - ClearPageDirty(page); - SetPageLaunder(page); - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - - writepage(page); - page_cache_release(page); - - spin_lock(&pagemap_lru_lock); + pte_chain_lock(page); + if (page_referenced(page, &over_rsslimit) && !over_rsslimit) { + pte_chain_unlock(page); + age_page_up_nolock(page, 0); + UnlockPage(page); continue; } + pte_chain_unlock(page); + + deactivate_page_nolock(page); + target--; + progress++; + UnlockPage(page); } - /* - * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we try to free - * the page as well. 
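The reclaim_anon/reclaim_cache decision at the top of refill_inactive_zone() can be read as a small policy function of cache_ratio() against cache_limits. A sketch with invented page counts, using the default limits from the hunk above:

#include <stdio.h>

struct limits { int min, borrow, max; };
static const struct limits cache_limits = { .min = 1, .borrow = 15, .max = 100 };

static int cache_ratio(unsigned long cache_pages, unsigned long anon_pages)
{
	/* percentage of the active list that is page cache (+1 avoids /0) */
	return 100 * cache_pages / (cache_pages + anon_pages + 1);
}

int main(void)
{
	unsigned long cache = 200, anon = 4000;
	int ratio = cache_ratio(cache, anon);
	int reclaim_anon  = ratio < cache_limits.borrow;
	int reclaim_cache = ratio >= cache_limits.min;

	/* Guard against a misconfigured borrow < min: reclaim both. */
	if (!reclaim_anon && !reclaim_cache)
		reclaim_anon = reclaim_cache = 1;

	printf("ratio=%d%% reclaim_anon=%d reclaim_cache=%d\n",
	       ratio, reclaim_anon, reclaim_cache);
	return 0;
}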
- */ - if (page->buffers) { - spin_unlock(&pagemap_lru_lock); + while (--cache_work >= 0 && zone->active_cache_pages) { + if (list_empty(&zone->active_cache_list[0])) { + kachunk_cache(zone); + continue; + } - /* avoid to free a locked page */ - page_cache_get(page); + page_lru = zone->active_cache_list[0].prev; + page = list_entry(page_lru, struct page, lru); - if (try_to_release_page(page, gfp_mask)) { - if (!page->mapping) { - /* - * We must not allow an anon page - * with no buffers to be visible on - * the LRU, so we unlock the page after - * taking the lru lock - */ - spin_lock(&pagemap_lru_lock); - UnlockPage(page); - __lru_cache_del(page); - - /* effectively free the page here */ - page_cache_release(page); - - if (--nr_pages) - continue; - break; - } else { - /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. - */ - page_cache_release(page); + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageActiveCache(page))); + + /* Needed to follow page->mapping */ + if (TryLockPage(page)) { + /* The page is already locked. This for sure means + * someone is doing stuff with it which makes it + * active by definition ;) + */ + del_page_from_active_cache_list(page); + add_page_to_active_cache_list(page, INITIAL_AGE); + continue; + } - spin_lock(&pagemap_lru_lock); - } - } else { - /* failed to drop the buffers so stop here */ + /* + * Do aging on the pages. + */ + pte_chain_lock(page); + if (page_referenced(page, &over_rsslimit) && !over_rsslimit) { + pte_chain_unlock(page); + age_page_up_nolock(page, 0); UnlockPage(page); - page_cache_release(page); - - spin_lock(&pagemap_lru_lock); continue; } + pte_chain_unlock(page); + + deactivate_page_nolock(page); + target--; + progress++; + UnlockPage(page); } + } + lru_unlock(zone); + + return progress; +} - spin_lock(&pagecache_lock); +static int need_active_anon_scan(struct zone_struct * zone) +{ + int low = 0, high = 0; + int k; + for (k=0; k < MAX_AGE/2; k++) + low += zone->active_anon_count[k]; - /* - * this is the non-racy check for busy page. - */ - if (!page->mapping || !is_page_cache_freeable(page)) { - spin_unlock(&pagecache_lock); - UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; + for (k=MAX_AGE/2; k <= MAX_AGE; k++) + high += zone->active_anon_count[k]; + + if (highactive_cache_count[k]; + + for (k=MAX_AGE/2; k <= MAX_AGE; k++) + high += zone->active_cache_count[k]; + + if (highinactive_laundry_list)) { + page_lru = zone->inactive_laundry_list.prev; + page = list_entry(page_lru, struct page, lru); + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageInactiveLaundry(page))); + + /* TryLock to see if the page IO is done */ + if (TryLockPage(page)) { /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! + * Page is locked (IO in progress?). If we can sleep, + * wait for it to finish, except when we've already + * done enough work. 
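The need_active_anon_scan()/need_active_cache_scan() helpers sum the young and old halves of the age buckets; the comparison itself is truncated in the hunk above, so the sketch below assumes a scan is requested when the old half holds fewer pages than the young half:

#include <stdio.h>

#define MAX_AGE 8

static int need_scan(const unsigned long count[MAX_AGE + 1])
{
	unsigned long low = 0, high = 0;
	int k;

	for (k = 0; k < MAX_AGE / 2; k++)
		low += count[k];
	for (k = MAX_AGE / 2; k <= MAX_AGE; k++)
		high += count[k];

	return high < low;   /* assumed: age pages up while most are still young */
}

int main(void)
{
	unsigned long count[MAX_AGE + 1] = { 40, 30, 20, 10, 5, 2, 1, 0, 0 };

	printf("scan needed: %s\n", need_scan(count) ? "yes" : "no");
	return 0;
}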
*/ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; + if ((gfp_mask & __GFP_WAIT) && (work_done < max_work)) { + int timed_out; + + page_cache_get(page); + lru_unlock(zone); + run_task_queue(&tq_disk); + timed_out = wait_on_page_timeout(page, 5 * HZ); + page_cache_release(page); + lru_lock(zone); + /* + * If we timed out and the page has been in + * flight for over 30 seconds, this might not + * be the best page to wait on; move it to + * the head of the dirty list. + */ + if (timed_out & PageInactiveLaundry(page)) { + unsigned char now; + now = (jiffies/HZ)&255; + if (now - page->age > 30) { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_dirty_list(page); + } + continue; + } + /* We didn't make any progress for our caller, + * but we are actively avoiding a livelock + * so undo the decrement and wait on this page + * some more, until IO finishes or we timeout. + */ + max_loop++; + continue; + } else + /* No dice, we can't wait for IO */ + break; } + UnlockPage(page); /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. + * If we get here either the IO on the page is done or + * IO never happened because it was clean. Either way + * move it to the inactive clean list. */ - if (PageDirty(page)) { - spin_unlock(&pagecache_lock); - UnlockPage(page); - continue; - } - - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - spin_unlock(&pagecache_lock); - } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); - swap_free(swap); - } - __lru_cache_del(page); - UnlockPage(page); + /* FIXME: check if the page is still clean or is accessed ? */ - /* effectively free the page here */ - page_cache_release(page); + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_clean_list(page); + work_done++; - if (--nr_pages) - continue; - break; + /* + * If we've done the minimal batch of work and there's + * no longer a need to rebalance, abort now. + */ + if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_laundry(zone))) + break; } - spin_unlock(&pagemap_lru_lock); - return nr_pages; + lru_unlock(zone); + return work_done; } /* - * This moves pages from the active list to - * the inactive list. - * - * We move them the other way when we see the - * reference bit on the page. + * Move max_work pages from the dirty list as long as there is a need. + * Start IO if the gfp_mask allows it. */ -static void refill_inactive(int nr_pages) +int rebalance_dirty_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask) { - struct list_head * entry; - - spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages && entry != &active_list) { - struct page * page; - - page = list_entry(entry, struct page, lru); - entry = entry->prev; - if (PageTestandClearReferenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); + struct list_head * page_lru; + int max_loop; + int work_done = 0; + struct page * page; + + max_loop = max_work; + if (max_loop < BATCH_WORK_AMOUNT) + max_loop = BATCH_WORK_AMOUNT; + /* Take the lock while messing with the list... */ + lru_lock(zone); + while (max_loop-- && !list_empty(&zone->inactive_dirty_list)) { + page_lru = zone->inactive_dirty_list.prev; + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! 
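launder_page() stamps page->age with (jiffies/HZ)&255 when IO starts, and the laundry scan above treats a page as stuck after more than 30 of those seconds. The sketch casts the difference back to a byte so the comparison survives the 256-second wrap; the helper name is invented:

#include <stdio.h>

static int io_stuck(unsigned char stamped, unsigned long now_seconds)
{
	unsigned char now = now_seconds & 255;   /* same 8-bit clock as page->age */

	return (unsigned char)(now - stamped) > 30;
}

int main(void)
{
	/* Stamped at second 250, checked at second 260 (4 ticks after the wrap). */
	printf("10s in flight: %s\n", io_stuck(250 & 255, 260) ? "stuck" : "ok");
	/* Stamped at second 250, checked at second 300. */
	printf("50s in flight: %s\n", io_stuck(250 & 255, 300) ? "stuck" : "ok");
	return 0;
}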
(list corruption, should not happen) */ + BUG_ON(unlikely(!PageInactiveDirty(page))); + + /* + * Note: launder_page() sleeps so we can't safely look at + * the page after this point! + * + * If we fail (only happens if we can't do IO) we just try + * again on another page; launder_page makes sure we won't + * see the same page over and over again. + */ + if (!launder_page(zone, gfp_mask, page)) continue; - } - nr_pages--; + work_done++; - del_page_from_active_list(page); - add_page_to_inactive_list(page); - SetPageReferenced(page); + /* + * If we've done the minimal batch of work and there's + * no longer any need to rebalance, abort now. + */ + if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_dirty(zone))) + break; } - spin_unlock(&pagemap_lru_lock); + lru_unlock(zone); + + return work_done; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/* goal percentage sets the goal of the laundry+clean+free of the total zone size */ +int rebalance_inactive_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask, int goal_percentage) { - int chunk_size = nr_pages; - unsigned long ratio; + int ret = 0; + /* first deactivate memory */ + if (((zone->inactive_laundry_pages + zone->inactive_clean_pages + zone->free_pages)*100 < zone->size * goal_percentage) && + (inactive_high(zone) > 0)) + refill_inactive_zone(zone, 0, max_work + BATCH_WORK_AMOUNT); + + if (need_rebalance_dirty(zone)) + ret += rebalance_dirty_zone(zone, max_work, gfp_mask); + if (need_rebalance_laundry(zone)) + ret += rebalance_laundry_zone(zone, max_work, gfp_mask); - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; + /* These pages will become freeable, let the OOM detection know */ + ret += zone->inactive_laundry_pages; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + return ret; +} - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; +int rebalance_inactive(unsigned int gfp_mask, int percentage) +{ + struct zone_struct * zone; + int max_work; + int ret = 0; + + max_work = 4 * BATCH_WORK_AMOUNT; + /* If we're in deeper trouble, do more work */ + if (percentage >= 50) + max_work = 8 * BATCH_WORK_AMOUNT; + + for_each_zone(zone) + ret += rebalance_inactive_zone(zone, max_work, gfp_mask, percentage); + /* 4 * BATCH_WORK_AMOUNT needs tuning */ - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + return ret; +} + +/** + * background_aging - slow background aging of zones + * @priority: priority at which to scan + * + * When the VM load is low or nonexistant, this function is + * called once a second to "sort" the pages in the VM. This + * way we know which pages to evict once a load spike happens. + * The effects of this function are very slow, the CPU usage + * should be minimal to nonexistant under most loads. 
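The goal test at the top of rebalance_inactive_zone() only deactivates more pages while laundry + clean + free pages are below goal_percentage of the zone; direct reclaim passes 100 and the kswapd path passes 5. A toy version with invented numbers:

#include <stdio.h>

static int below_goal(unsigned long laundry, unsigned long clean,
		      unsigned long free_pages, unsigned long zone_size,
		      unsigned long goal_percentage)
{
	return (laundry + clean + free_pages) * 100 < zone_size * goal_percentage;
}

int main(void)
{
	/* 1000-page zone with 60 easy pages: fine for kswapd's 5% goal ... */
	printf("refill at 5%%:   %s\n", below_goal(20, 20, 20, 1000, 5) ? "yes" : "no");
	/* ... but direct reclaim's 100% goal still wants more deactivation. */
	printf("refill at 100%%: %s\n", below_goal(20, 20, 20, 1000, 100) ? "yes" : "no");
	return 0;
}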
+ */ +static inline void background_aging(int priority) +{ + struct zone_struct * zone; - return nr_pages; + for_each_zone(zone) + if (inactive_low(zone) > 0) + refill_inactive_zone(zone, priority, BATCH_WORK_AMOUNT); + for_each_zone(zone) + if (free_plenty(zone) > 0) + rebalance_dirty_zone(zone, BATCH_WORK_AMOUNT, GFP_KSWAPD); } -int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask) +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) { - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + int ret = 0; - gfp_mask = pf_gfp_mask(gfp_mask); - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. + */ + ret += rebalance_inactive(gfp_mask, 100); + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(1, gfp_mask); +#ifdef CONFIG_QUOTA + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); +#endif + + /* + * Reclaim unused slab cache memory. + */ + ret += kmem_cache_reap(gfp_mask); /* * Hmm.. Cache shrink failed - time to kill something? * Mhwahahhaha! This is the part I really like. Giggle. */ - out_of_memory(); - return 0; -} - -int try_to_free_pages(unsigned int gfp_mask) -{ - pg_data_t *pgdat; - zonelist_t *zonelist; - unsigned long pf_free_pages; - int error = 0; - - pf_free_pages = current->flags & PF_FREE_PAGES; - current->flags &= ~PF_FREE_PAGES; + if (!ret && free_low(ANY_ZONE) && (gfp_mask&__GFP_WAIT)) + out_of_memory(); - for_each_pgdat(pgdat) { - zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); - error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask); - } - - current->flags |= pf_free_pages; - return error; + return ret; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages_kswapd(unsigned int gfp_mask) { - zone_t * first_classzone; + int ret = 0; + struct zone_struct * zone; - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; -} + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(DEF_PRIORITY, gfp_mask); + ret += try_to_reclaim_buffers(DEF_PRIORITY, gfp_mask); +#ifdef CONFIG_QUOTA + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); +#endif -static int kswapd_balance_pgdat(pg_data_t * pgdat) -{ - int need_more_balance = 0, i; - zone_t * zone; + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. 
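A small model of the out-of-memory gate at the end of do_try_to_free_pages(): the OOM code is only reached when a full reclaim pass made no progress, some zone is genuinely short on free pages, and the caller was allowed to wait. GFP_WAIT here is a stand-in bit, not the kernel's definition:

#include <stdio.h>

#define GFP_WAIT 0x10   /* stand-in for __GFP_WAIT */

static int should_oom(int progress, int free_low_any_zone, unsigned gfp_mask)
{
	return !progress && free_low_any_zone && (gfp_mask & GFP_WAIT);
}

int main(void)
{
	printf("%d\n", should_oom(0, 1, GFP_WAIT));  /* 1: time to kill something */
	printf("%d\n", should_oom(3, 1, GFP_WAIT));  /* 0: reclaim made progress  */
	printf("%d\n", should_oom(0, 1, 0));         /* 0: atomic caller          */
	return 0;
}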
+ */ + rebalance_inactive(gfp_mask, 5); - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (unlikely(current->need_resched)) - schedule(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - continue; - } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; + for_each_zone(zone) { + int maxloop = zone->inactive_dirty_pages; + maxloop = maxloop / (16 * BATCH_WORK_AMOUNT) + 1; + while (need_rebalance_dirty(zone) && maxloop-- > 0) + rebalance_dirty_zone(zone, 16 * BATCH_WORK_AMOUNT, gfp_mask); } - return need_more_balance; -} + for_each_zone(zone) + if (free_high(zone)>0) + rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0); -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; + refill_freelist(); - do { - need_more_balance = 0; + /* Start IO when needed. */ + if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0) + run_task_queue(&tq_disk); - for_each_pgdat(pgdat) - need_more_balance |= kswapd_balance_pgdat(pgdat); - } while (need_more_balance); + return ret; } -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +/** + * refill_freelist - move inactive_clean pages to free list if needed + * + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. This + * function really only does something when we don't have a + * userspace load on __alloc_pages(). + * + * We refill the freelist in a bump from pages_min to pages_min * 2 + * in order to give the buddy allocator something to play with. + */ +static void refill_freelist(void) { + struct page * page; zone_t * zone; - int i; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) + for_each_zone(zone) { + if (!zone->size || zone->free_pages >= zone->pages_min) continue; - return 0; - } - - return 1; -} -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; - - for_each_pgdat(pgdat) { - if (!kswapd_can_sleep_pgdat(pgdat)) - return 0; + while (zone->free_pages < zone->pages_min * 2) { + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } } - - return 1; } /* @@ -720,7 +976,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -744,31 +999,229 @@ * Kswapd main loop. */ for (;;) { - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + static long recalc = 0; - mb(); - if (kswapd_can_sleep()) - schedule(); + /* + * We try to rebalance the VM either when we have a + * global shortage of free pages or when one particular + * zone is very short on free pages. + */ + if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0) + do_try_to_free_pages_kswapd(GFP_KSWAPD); + + refill_freelist(); + + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + + /* Do background page aging. */ + background_aging(DEF_PRIORITY); + } + + wakeup_memwaiters(); + } +} + +static int kswapd_overloaded; +unsigned int kswapd_minfree; /* initialized in mm/page_alloc.c */ +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +DECLARE_WAIT_QUEUE_HEAD(kswapd_done); + +/** + * wakeup_kswapd - wake up the pageout daemon + * gfp_mask: page freeing flags + * + * This function wakes up kswapd and can, under heavy VM pressure, + * put the calling task to sleep temporarily. 
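refill_freelist() tops a zone back up from its inactive_clean pool once it drops below pages_min, stopping at pages_min * 2 so atomic allocations have headroom. A toy zone makes the bump visible; the struct is invented for the sketch:

#include <stdio.h>

struct toy_zone { unsigned long free_pages, pages_min, inactive_clean; };

static void refill_freelist(struct toy_zone *z)
{
	if (z->free_pages >= z->pages_min)
		return;                         /* zone is fine, leave it alone */
	while (z->free_pages < z->pages_min * 2 && z->inactive_clean) {
		z->inactive_clean--;            /* reclaim_page()               */
		z->free_pages++;                /* __free_page()                */
	}
}

int main(void)
{
	struct toy_zone z = { .free_pages = 10, .pages_min = 32, .inactive_clean = 500 };

	refill_freelist(&z);
	printf("free=%lu clean=%lu\n", z.free_pages, z.inactive_clean);
	return 0;
}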
+ */ +void wakeup_kswapd(unsigned int gfp_mask) +{ + DECLARE_WAITQUEUE(wait, current); + + /* If we're in the memory freeing business ourself, don't sleep + * but just wake kswapd and go back to businesss. + */ + if (current->flags & PF_MEMALLOC) { + wake_up_interruptible(&kswapd_wait); + return; + } - __set_current_state(TASK_RUNNING); + /* We need all of kswapd's GFP flags, otherwise we can't sleep on it. + * We still wake kswapd of course. + */ + if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) { + wake_up_interruptible(&kswapd_wait); + return; + } + + add_wait_queue(&kswapd_done, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Wake kswapd .... */ + wake_up_interruptible(&kswapd_wait); + + /* ... and check if we need to wait on it */ + if ((free_low(ALL_ZONES) > (kswapd_minfree / 2)) && !kswapd_overloaded) + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); +} + +static void wakeup_memwaiters(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&kswapd_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + /* Don't let the processes waiting on memory get stuck, ever. */ + wake_up(&kswapd_done); + + /* Enough free RAM, we can easily keep up with memory demand. */ + if (free_high(ALL_ZONES) <= 0) { + schedule_timeout(HZ); remove_wait_queue(&kswapd_wait, &wait); + return; + } + remove_wait_queue(&kswapd_wait, &wait); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. - */ - kswapd_balance(); - run_task_queue(&tq_disk); + /* OK, the VM is very loaded. Sleep instead of using all CPU. */ + kswapd_overloaded = 1; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 40); + kswapd_overloaded = 0; + return; +} + +/** + * try_to_free_pages - run the pageout code ourselves + * gfp_mask: mask of things the pageout code is allowed to do + * + * When the load on the system gets higher, it can happen + * that kswapd no longer manages to keep enough memory + * free. In those cases user programs allocating memory + * will call try_to_free_pages() and help the pageout code. + * This has the effects of freeing memory and slowing down + * the largest memory hogs a bit. + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + + gfp_mask = pf_gfp_mask(gfp_mask); + if (gfp_mask & __GFP_WAIT) { + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + + return ret; +} + +/** + * rss_free_pages - run part of the pageout code and slow down a bit + * @gfp_mask: mask of things the pageout code is allowed to do + * + * This function is called when a task is over its RSS limit and + * has a page fault. It's goal is to free some memory so non-hogs + * can run faster and slow down itself when needed so it won't eat + * the memory non-hogs can use. + */ +void rss_free_pages(unsigned int gfp_mask) +{ + long pause = 0; + struct zone_struct * zone; + + if (current->flags & PF_MEMALLOC) + return; + + current->flags |= PF_MEMALLOC; + + do { + rebalance_inactive(gfp_mask, 100); + for_each_zone(zone) + if (free_plenty(zone) >= 0) + rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(pause); + set_current_state(TASK_RUNNING); + pause++; + } while (free_high(ALL_ZONES) >= 0); + + current->flags &= ~PF_MEMALLOC; + return; } +/* + * The background page scanning daemon, started as a kernel thread + * from the init process. 
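The throttling idea in rss_free_pages() above is that each trip around the reclaim loop sleeps one jiffy longer than the last, so an over-limit task slows itself in proportion to how long the shortage lasts. The sketch fakes the sleep with a counter and an assumed number of passes:

#include <stdio.h>

int main(void)
{
	long pause = 0;          /* jiffies to sleep this round            */
	long slept = 0;
	int rounds_needed = 5;   /* pretend pressure clears after 5 passes */

	do {
		/* rebalance_inactive() / rebalance_laundry_zone() go here */
		slept += pause;  /* schedule_timeout(pause)                */
		pause++;         /* back off a little more next time       */
	} while (--rounds_needed > 0);

	printf("total ticks slept: %ld over %ld rounds\n", slept, pause);
	return 0;
}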
+ * + * This is the part that background scans the active list to find + * pages that are referenced and increases their age score. + * It is important that this scan rate is not proportional to vm pressure + * per se otherwise cpu usage becomes unbounded. On the other hand, if there's + * no VM pressure at all it shouldn't age stuff either otherwise everything + * ends up at the maximum age. + */ +int kscand(void *unused) +{ + struct task_struct *tsk = current; + struct zone_struct * zone; + unsigned long iv; + int age; + + daemonize(); + strcpy(tsk->comm, "kscand"); + sigfillset(&tsk->blocked); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(MIN_AGING_INTERVAL); + for_each_zone(zone) { + if (time_before(jiffies, zone->age_next)) + continue; + + if (need_active_anon_scan(zone)) { + for (age = 0; age < MAX_AGE; age++) { + scan_active_list(zone, age, + &zone->active_anon_list[age]); + if (current->need_resched) + schedule(); + } + } + + if (need_active_cache_scan(zone)) { + for (age = 0; age < MAX_AGE; age++) { + scan_active_list(zone, age, + &zone->active_cache_list[age]); + if (current->need_resched) + schedule(); + } + } + + iv = zone->age_interval; + /* Check if we've been aging quickly enough ... */ + if (zone->need_scan >= 2) + iv = max(iv / 2, MIN_AGING_INTERVAL); + /* ... or too quickly. */ + else if (!zone->need_scan) + iv = max(iv + (iv / 2), MAX_AGING_INTERVAL); + zone->need_scan = 0; + zone->age_interval = iv; + zone->age_next = jiffies + iv; + } + } +} + + static int __init kswapd_init(void) { printk("Starting kswapd\n"); swap_setup(); kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + kernel_thread(kscand, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); return 0; }
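kscand retunes each zone's aging interval: halved (down to MIN_AGING_INTERVAL) when the zone asked for scans twice or more, grown when no scan was needed. The hunk above grows with max(iv + iv/2, MAX_AGING_INTERVAL), which jumps straight to the ceiling; the sketch below caps the growth with the smaller of the two instead, which is only one plausible reading of the intent:

#include <stdio.h>

#define HZ                  100UL
#define MIN_AGING_INTERVAL  (HZ / 2)
#define MAX_AGING_INTERVAL  (300UL * HZ)

static unsigned long retune(unsigned long iv, int need_scan)
{
	if (need_scan >= 2)          /* emergency wakeups happened: age faster */
		iv = iv / 2 > MIN_AGING_INTERVAL ? iv / 2 : MIN_AGING_INTERVAL;
	else if (!need_scan)         /* zone idle: back off                    */
		iv = iv + iv / 2 < MAX_AGING_INTERVAL ? iv + iv / 2 : MAX_AGING_INTERVAL;
	return iv;
}

int main(void)
{
	printf("under pressure: %lu ticks\n", retune(10 * HZ, 3));
	printf("idle zone:      %lu ticks\n", retune(10 * HZ, 0));
	return 0;
}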