# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.1101 -> 1.1110
# mm/oom_kill.c 1.9.1.2 -> 1.13
# include/asm-ppc64/pgtable.h 1.5 -> 1.7
# arch/ppc64/kernel/pmc.c 1.4 -> 1.5
# arch/i386/mm/fault.c 1.13 -> 1.14
# drivers/sgi/char/graphics.c 1.6 -> 1.7
# include/asm-s390x/page.h 1.7 -> 1.8
# include/linux/list.h 1.9 -> 1.10
# include/linux/mmzone.h 1.7.1.2 -> 1.19
# include/linux/pagemap.h 1.19 -> 1.21
# kernel/ksyms.c 1.64.1.3 -> 1.67
# include/linux/swap.h 1.31.1.5 -> 1.44
# include/linux/elevator.h 1.4.1.1 -> 1.7
# include/linux/mm.h 1.38.1.2 -> 1.55
# fs/proc/array.c 1.9 -> 1.10
# include/asm-i386/fixmap.h 1.4.1.1 -> 1.6
# include/asm-i386/pgalloc.h 1.9 -> 1.10
# mm/mmap.c 1.23.1.6 -> 1.28
# drivers/s390/ccwcache.c 1.3.1.1 -> 1.5
# mm/page_alloc.c 1.43.1.13 -> 1.64
# include/asm-i386/highmem.h 1.4.1.1 -> 1.6
# drivers/char/drm/drm_proc.h 1.5 -> 1.6
# include/linux/sched.h 1.29.1.7 -> 1.34
# kernel/fork.c 1.23.1.4 -> 1.27
# drivers/block/ll_rw_blk.c 1.34.1.9 -> 1.38
# kernel/sys.c 1.9.1.3 -> 1.12
# kernel/sysctl.c 1.16.1.3 -> 1.21
# arch/i386/kernel/vm86.c 1.5 -> 1.6
# Makefile 1.161.1.46 -> 1.182
# include/linux/swapctl.h 1.2 -> 1.4
# fs/dcache.c 1.16.1.6 -> 1.23
# fs/dquot.c 1.16.1.1 -> 1.19
# mm/vmscan.c 1.59.1.3 -> 1.97
# fs/proc/proc_misc.c 1.13.1.7 -> 1.20
# include/asm-s390x/pgalloc.h 1.6 -> 1.7
# arch/s390/mm/ioremap.c 1.3 -> 1.4
# include/asm-i386/page.h 1.10 -> 1.11
# include/asm-s390/page.h 1.6 -> 1.7
# init/main.c 1.26 -> 1.27
# arch/s390/mm/init.c 1.8 -> 1.9
# arch/ppc64/mm/init.c 1.4 -> 1.5
# arch/x86_64/mm/pageattr.c 1.2 -> 1.3
# arch/x86_64/mm/init.c 1.5 -> 1.6
# mm/swapfile.c 1.23.1.3 -> 1.28
# arch/i386/mm/init.c 1.13.1.1 -> 1.15
# include/linux/fs.h 1.60.1.14 -> 1.68
# arch/i386/mm/Makefile 1.2 -> 1.3
# arch/i386/mm/ioremap.c 1.4 -> 1.5
# mm/numa.c 1.4 -> 1.6
# mm/bootmem.c 1.6.1.3 -> 1.9
# arch/x86_64/mm/ioremap.c 1.3 -> 1.4
# arch/i386/config.in 1.35.1.6 -> 1.37
# arch/s390x/mm/init.c 1.9 -> 1.10
# arch/i386/mm/pageattr.c 1.2 -> 1.3
# mm/filemap.c 1.62.1.14 -> 1.85
# include/linux/brlock.h 1.3 -> 1.4
# fs/exec.c 1.20.1.4 -> 1.26
# mm/swap.c 1.16.1.1 -> 1.27
# mm/mprotect.c 1.4 -> 1.5
# include/asm-s390x/pgtable.h 1.8 -> 1.9
# mm/shmem.c 1.45.1.1 -> 1.48
# mm/swap_state.c 1.17.1.1 -> 1.21
# include/asm-x86_64/pgtable.h 1.4 -> 1.5
# mm/memory.c 1.50.1.3 -> 1.60
# include/linux/slab.h 1.8.1.2 -> 1.12
# arch/i386/kernel/setup.c 1.37.1.26 -> 1.43
# arch/s390x/mm/ioremap.c 1.2 -> 1.3
# drivers/char/mem.c 1.17 -> 1.18
# include/asm-ppc64/pgalloc.h 1.2 -> 1.4
# include/asm-i386/pgtable-2level.h 1.3 -> 1.4
# mm/mremap.c 1.5 -> 1.8
# mm/vmalloc.c 1.13.1.2 -> 1.15
# fs/buffer.c 1.61.1.21 -> 1.69
# include/asm-s390/pgalloc.h 1.6 -> 1.7
# include/asm-x86_64/page.h 1.3 -> 1.4
# include/asm-ppc64/page.h 1.4 -> 1.5
# mm/Makefile 1.5 -> 1.6
# arch/arm/mm/mm-armv.c 1.5 -> 1.6
# arch/ppc64/kernel/htab.c 1.5 -> 1.6
# include/asm-i386/kmap_types.h 1.6 -> 1.7
# drivers/ieee1394/ieee1394_types.h 1.10 -> 1.11
# include/asm-i386/pgtable.h 1.8 -> 1.9
# include/linux/highmem.h 1.10.1.1 -> 1.12
# drivers/block/elevator.c 1.5.1.3 -> 1.8
# mm/slab.c 1.14.1.8 -> 1.21
# include/linux/module.h 1.12 -> 1.13
# include/asm-s390/pgtable.h 1.8 -> 1.9
# include/asm-i386/pgtable-3level.h 1.4 -> 1.5
# include/asm-x86_64/io.h 1.3 -> 1.4
# include/asm-x86_64/pgalloc.h 1.2 -> 1.3
# fs/inode.c 1.32.1.4 -> 1.38
#
arch/x86_64/mm/fault.c 1.7 -> 1.8 # (new) -> 1.1 include/asm-alpha/rmap.h # (new) -> 1.14 mm/rmap.c # (new) -> 1.1 include/asm-sparc64/rmap.h # (new) -> 1.2 include/asm-i386/rmap.h # (new) -> 1.1 arch/i386/mm/pgtable.c # (new) -> 1.2 include/asm-generic/rmap.h # (new) -> 1.1 include/asm-s390x/rmap.h # (new) -> 1.1 include/asm-ppc/rmap.h # (new) -> 1.1 include/asm-arm/rmap.h # (new) -> 1.1 include/asm-mips/rmap.h # (new) -> 1.9 include/linux/mm_inline.h # (new) -> 1.1 include/asm-s390/rmap.h # (new) -> 1.1 include/asm-sh/rmap.h # (new) -> 1.1 include/asm-m68k/rmap.h # (new) -> 1.1 include/asm-ia64/rmap.h # (new) -> 1.1 include/asm-x86_64/rmap.h # (new) -> 1.22 Changelog.rmap # (new) -> 1.1 mm/TODO # (new) -> 1.1 include/asm-sparc/rmap.h # (new) -> 1.1 include/asm-mips64/rmap.h # (new) -> 1.1 include/asm-arm/proc-armv/rmap.h # (new) -> 1.1 include/asm-ppc64/rmap.h # (new) -> 1.1 include/asm-cris/rmap.h # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/04/03 riel@chimarrao.boston.redhat.com 1.757.33.29 # add barrier() to page_chain_lock() (Pete Zaitcev) # -------------------------------------------- # 03/04/11 riel@cluless.boston.redhat.com 1.757.33.30 # pte-highmem defines for ppc64 (Julie DeWandel) # -------------------------------------------- # 03/04/11 riel@chimarrao.boston.redhat.com 1.757.33.31 # pte-highmem updates for s390 and s390x (Pete Zaitcev) # -------------------------------------------- # 03/04/11 riel@cluless.boston.redhat.com 1.757.33.32 # rmap 15f release # -------------------------------------------- # 03/04/12 riel@imladris.surriel.com 1.1102 # merge up to 2.4.21-pre7 # -------------------------------------------- # 03/04/12 riel@imladris.surriel.com 1.1103 # fix up /dev/mem pte-highmem awareness # -------------------------------------------- # 03/04/15 riel@chimarrao.boston.redhat.com 1.757.33.33 # ooops, forgot to add asm-ppc64/rmap.h ... # -------------------------------------------- # 03/04/15 riel@cluless.boston.redhat.com 1.1104 # Merge linuxvm@linuxvm.bkbits.net:linux-2.4-rmap # into cluless.boston.redhat.com:/home/boston/riel/bk/linux-2.4-rmap # -------------------------------------------- # 03/04/16 riel@chimarrao.boston.redhat.com 1.1105 # additional ppc64 pte-highmem fix # -------------------------------------------- # 03/04/16 riel@chimarrao.boston.redhat.com 1.1106 # yet another piece of the ppc64 pte-highmem puzzle ;) # -------------------------------------------- # 03/04/16 riel@chimarrao.boston.redhat.com 1.1107 # x86_64 rmap bits (Jim Paradis) # -------------------------------------------- # 03/04/17 riel@cluless.boston.redhat.com 1.1108 # awwww shucks, forgot to check in asm-x86_64/rmap.h ... # -------------------------------------------- # 03/04/17 riel@cluless.boston.redhat.com 1.1109 # reclaim buffer heads under memory pressure # -------------------------------------------- # 03/04/17 riel@cluless.boston.redhat.com 1.1110 # rmap 15g release # -------------------------------------------- # diff -Nru a/Changelog.rmap b/Changelog.rmap --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/Changelog.rmap Thu Apr 17 15:25:14 2003 @@ -0,0 +1,245 @@ +The seventh maintenance release of the 15th version of the reverse +mapping based VM is now available. +This is an attempt at making a more robust and flexible VM +subsystem, while cleaning up a lot of code at the same time. 
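
A note on the calling convention the changelog entries below keep referring to: under this patch, code that installs a user pte preallocates a pte_chain, maps the (possibly highmem) pte page, sets the pte and only then registers the reverse mapping, exactly as put_dirty_page() does in the fs/exec.c hunk later in this patch. The fragment below is only a condensed illustration of that pattern, not code from the patch; the function name install_one_pte() and its arguments are invented for the example, while pte_chain_alloc(), pte_alloc_map(), page_add_rmap(), pte_unmap() and pte_chain_free() are the interfaces the patch itself uses.

/*
 * Condensed illustration (not part of the patch) of the rmap pte-install
 * pattern used by put_dirty_page() in the fs/exec.c hunk below.
 * install_one_pte() and its arguments are invented for this example;
 * the pte_chain/rmap declarations come from the headers this patch touches.
 */
#include <linux/sched.h>
#include <linux/mm.h>

static int install_one_pte(struct mm_struct *mm, unsigned long address,
			   struct page *page, pgprot_t prot)
{
	struct pte_chain *pte_chain;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;

	/* Preallocate the reverse-mapping entry: a GFP_KERNEL allocation
	 * can sleep, so it must happen before taking page_table_lock. */
	pte_chain = pte_chain_alloc(GFP_KERNEL);
	if (!pte_chain)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	pgd = pgd_offset(mm, address);
	pmd = pmd_alloc(mm, pgd, address);
	pte = pmd ? pte_alloc_map(mm, pmd, address) : NULL; /* kmaps a highmem pte page */
	if (!pte) {
		spin_unlock(&mm->page_table_lock);
		pte_chain_free(pte_chain);
		return -ENOMEM;
	}
	if (!pte_none(*pte)) {		/* a pte is already installed here */
		pte_unmap(pte);
		spin_unlock(&mm->page_table_lock);
		pte_chain_free(pte_chain);
		return 0;
	}
	set_pte(pte, mk_pte(page, prot));
	/* Link the physical page back to this pte: the reverse mapping. */
	pte_chain = page_add_rmap(page, pte, pte_chain);
	mm->rss++;
	pte_unmap(pte);			/* drop the atomic kmap (CONFIG_HIGHPTE) */
	spin_unlock(&mm->page_table_lock);

	pte_chain_free(pte_chain);	/* free an unused preallocation, if any */
	return 0;
}

As in the fs/exec.c hunk, page_add_rmap() either consumes the preallocated pte_chain or hands it back, and the trailing pte_chain_free() releases whatever is left over.
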
+The patch is available from: + + http://surriel.com/patches/2.4/2.4.20-rmap15g +and http://linuxvm.bkbits.net/ + + +My big TODO items for a next release are: + - finetune the O(1) VM code for strange corner cases + - add pte-highmem defines for more architectures + - highmem tweaks + + +rmap 15g: + - more ppc64 pte-highmem stuff (Julie DeWandel) + - hammer pte-highmem stuff (Jim Paradis) + - reclaim buffer heads under memory pressure (me) +rmap 15f: + - remove pte-highmem compat define from ieee1394 (Marc-C. Petersen) + - clean up scan_active_list after suggestion from hch (me) + - lock ordering fix (me) + - add barrier() to page_chain_lock() (Pete Zaitcev) + - fix pte-highmem defines for ppc64 (Julie DeWandel) + - add pte-highmem defines for s390 & s390x (Pete Zaitcev) +rmap 15e: + - make reclaiming unused inodes more efficient (Arjan van de Ven) + | push to Marcelo and Andrew once it's well tested ! + - fix DRM memory leak (Arjan van de Ven) + - fix potential infinite loop in kswapd (me) + - clean up elevator.h (no IO scheduler in -rmap...) (me) + - page aging interval tuned on a per zone basis, better + wakeup mechanism for sudden memory pressure (Arjan, me) +rmap 15d: + - compatability with PREEMPT patch (me) + | fairly ugly, but should work + - bugfix for the pte_chain allocation code (Arjan van de Ven) +rmap 15c: + - backport and audit akpm's reliable pte_chain alloc + code from 2.5 (me) + - reintroduce cache size tuning knobs in /proc (me) + | on very, very popular request +rmap 15b: + - adjust anon/cache work table (me) + - make active_age_bias a per-active list thing (me) + - don't wake up kswapd early from mark_page_accessed (me) + - make sure pte-chains are cacheline aligned with PAE (me, Andrew Morton) + - change some O(1) VM thresholds (me) + - fix pte-highmem backport (me) + - 2.5 backport: pte-highmem (Ben LaHaise) + - 2.5 backport: large cacheline aligned pte-chains (Ben LaHaise) + - 2.5 backport: direct pte pointers (Ben LaHaise) + - undo __find_pagecache_page braindamage (Christoph Hellwig) +rmap 15a: + - more agressive freeing for higher order allocations (me) + - export __find_pagecache_page, find_get_page define (me, Christoph, Arjan) + - make memory statistics SMP safe again (me) + - make page aging slow down again when needed (Andrew Morton) + - first stab at fine-tuning arjan's O(1) VM (me) + - split active list in cache / working set (me) + - fix SMP locking in arjan's O(1) VM (me) +rmap 15: + - small code cleanups and spelling fixes for O(1) VM (me) + - O(1) page launder, O(1) page aging (Arjan van de Ven) + - resync code with -ac (12 small patches) (me) +rmap 14c: + - fold page_over_rsslimit() into page_referenced() (me) + - 2.5 backport: get pte_chains from the slab cache (William Lee Irwin) + - remove dead code from page_launder_zone() (me) + - make OOM detection a bit more agressive (me) +rmap 14b: + - don't unmap pages not in pagecache (ext3 & reiser) (Andrew Morton, me) + - clean up mark_page_accessed a bit (me) + - Alpha NUMA fix for Ingo's per-cpu pages (Flávio Leitner, me) + - remove explicit low latency schedule zap_page_range (Robert Love) + - fix OOM stuff for good, hopefully (me) +rmap 14a: + - Ingo Molnar's per-cpu pages (SMP speedup) (Christoph Hellwig) + - fix SMP bug in page_launder_zone (rmap14 only) (Arjan van de Ven) + - semicolon day, fix typo in rmap.c w/ DEBUG_RMAP (Craig Kulesa) + - remove unneeded pte_chain_unlock/lock pair vmscan.c (Craig Kulesa) + - low latency zap_page_range also without preempt (Arjan van de Ven) + - do some 
throughput tuning for kswapd/page_launder (me) + - don't allocate swap space for pages we're not writing (me) +rmap 14: + - get rid of stalls during swapping, hopefully (me) + - low latency zap_page_range (Robert Love) +rmap 13c: + - add wmb() to wakeup_memwaiters (Arjan van de Ven) + - remap_pmd_range now calls pte_alloc with full address (Paul Mackerras) + - #ifdef out pte_chain_lock/unlock on UP machines (Andrew Morton) + - un-BUG() truncate_complete_page, the race is expected (Andrew Morton, me) + - remove NUMA changes from rmap13a (Christoph Hellwig) +rmap 13b: + - prevent PF_MEMALLOC recursion for higher order allocs (Arjan van de Ven, me) + - fix small SMP race, PG_lru (Hugh Dickins) +rmap 13a: + - NUMA changes for page_address (Samuel Ortiz) + - replace vm.freepages with simpler kswapd_minfree (Christoph Hellwig) +rmap 13: + - rename touch_page to mark_page_accessed and uninline (Christoph Hellwig) + - NUMA bugfix for __alloc_pages (William Irwin) + - kill __find_page (Christoph Hellwig) + - make pte_chain_freelist per zone (William Irwin) + - protect pte_chains by per-page lock bit (William Irwin) + - minor code cleanups (me) +rmap 12i: + - slab cleanup (Christoph Hellwig) + - remove references to compiler.h from mm/* (me) + - move rmap to marcelo's bk tree (me) + - minor cleanups (me) +rmap 12h: + - hopefully fix OOM detection algorithm (me) + - drop pte quicklist in anticipation of pte-highmem (me) + - replace andrea's highmem emulation by ingo's one (me) + - improve rss limit checking (Nick Piggin) +rmap 12g: + - port to armv architecture (David Woodhouse) + - NUMA fix to zone_table initialisation (Samuel Ortiz) + - remove init_page_count (David Miller) +rmap 12f: + - for_each_pgdat macro (William Lee Irwin) + - put back EXPORT(__find_get_page) for modular rd (me) + - make bdflush and kswapd actually start queued disk IO (me) +rmap 12e + - RSS limit fix, the limit can be 0 for some reason (me) + - clean up for_each_zone define to not need pgdata_t (William Lee Irwin) + - fix i810_dma bug introduced with page->wait removal (William Lee Irwin) +rmap 12d: + - fix compiler warning in rmap.c (Roger Larsson) + - read latency improvement (read-latency2) (Andrew Morton) +rmap 12c: + - fix small balancing bug in page_launder_zone (Nick Piggin) + - wakeup_kswapd / wakeup_memwaiters code fix (Arjan van de Ven) + - improve RSS limit enforcement (me) +rmap 12b: + - highmem emulation (for debugging purposes) (Andrea Arcangeli) + - ulimit RSS enforcement when memory gets tight (me) + - sparc64 page->virtual quickfix (Greg Procunier) +rmap 12a: + - fix the compile warning in buffer.c (me) + - fix divide-by-zero on highmem initialisation DOH! (me) + - remove the pgd quicklist (suspicious ...) 
(DaveM, me) +rmap 12: + - keep some extra free memory on large machines (Arjan van de Ven, me) + - higher-order allocation bugfix (Adrian Drzewiecki) + - nr_free_buffer_pages() returns inactive + free mem (me) + - pages from unused objects directly to inactive_clean (me) + - use fast pte quicklists on non-pae machines (Andrea Arcangeli) + - remove sleep_on from wakeup_kswapd (Arjan van de Ven) + - page waitqueue cleanup (Christoph Hellwig) +rmap 11c: + - oom_kill race locking fix (Andres Salomon) + - elevator improvement (Andrew Morton) + - dirty buffer writeout speedup (hopefully ;)) (me) + - small documentation updates (me) + - page_launder() never does synchronous IO, kswapd + and the processes calling it sleep on higher level (me) + - deadlock fix in touch_page() (me) +rmap 11b: + - added low latency reschedule points in vmscan.c (me) + - make i810_dma.c include mm_inline.h too (William Lee Irwin) + - wake up kswapd sleeper tasks on OOM kill so the + killed task can continue on its way out (me) + - tune page allocation sleep point a little (me) +rmap 11a: + - don't let refill_inactive() progress count for OOM (me) + - after an OOM kill, wait 5 seconds for the next kill (me) + - agpgart_be fix for hashed waitqueues (William Lee Irwin) +rmap 11: + - fix stupid logic inversion bug in wakeup_kswapd() (Andrew Morton) + - fix it again in the morning (me) + - add #ifdef BROKEN_PPC_PTE_ALLOC_ONE to rmap.h, it + seems PPC calls pte_alloc() before mem_map[] init (me) + - disable the debugging code in rmap.c ... the code + is working and people are running benchmarks (me) + - let the slab cache shrink functions return a value + to help prevent early OOM killing (Ed Tomlinson) + - also, don't call the OOM code if we have enough + free pages (me) + - move the call to lru_cache_del into __free_pages_ok (Ben LaHaise) + - replace the per-page waitqueue with a hashed + waitqueue, reduces size of struct page from 64 + bytes to 52 bytes (48 bytes on non-highmem machines) (William Lee Irwin) +rmap 10: + - fix the livelock for real (yeah right), turned out + to be a stupid bug in page_launder_zone() (me) + - to make sure the VM subsystem doesn't monopolise + the CPU, let kswapd and some apps sleep a bit under + heavy stress situations (me) + - let __GFP_HIGH allocations dig a little bit deeper + into the free page pool, the SCSI layer seems fragile (me) +rmap 9: + - improve comments all over the place (Michael Cohen) + - don't panic if page_remove_rmap() cannot find the + rmap in question, it's possible that the memory was + PG_reserved and belonging to a driver, but the driver + exited and cleared the PG_reserved bit (me) + - fix the VM livelock by replacing > by >= in a few + critical places in the pageout code (me) + - treat the reclaiming of an inactive_clean page like + allocating a new page, calling try_to_free_pages() + and/or fixup_freespace() if required (me) + - when low on memory, don't make things worse by + doing swapin_readahead (me) +rmap 8: + - add ANY_ZONE to the balancing functions to improve + kswapd's balancing a bit (me) + - regularize some of the maximum loop bounds in + vmscan.c for cosmetic purposes (William Lee Irwin) + - move page_address() to architecture-independent + code, now the removal of page->virtual is portable (William Lee Irwin) + - speed up free_area_init_core() by doing a single + pass over the pages and not using atomic ops (William Lee Irwin) + - documented the buddy allocator in page_alloc.c (William Lee Irwin) +rmap 7: + - clean up and document vmscan.c (me) + - 
reduce size of page struct, part one (William Lee Irwin) + - add rmap.h for other archs (untested, not for ARM) (me) +rmap 6: + - make the active and inactive_dirty list per zone, + this is finally possible because we can free pages + based on their physical address (William Lee Irwin) + - cleaned up William's code a bit (me) + - turn some defines into inlines and move those to + mm_inline.h (the includes are a mess ...) (me) + - improve the VM balancing a bit (me) + - add back inactive_target to /proc/meminfo (me) +rmap 5: + - fixed recursive buglet, introduced by directly + editing the patch for making rmap 4 ;))) (me) +rmap 4: + - look at the referenced bits in page tables (me) +rmap 3: + - forgot one FASTCALL definition (me) +rmap 2: + - teach try_to_unmap_one() about mremap() (me) + - don't assign swap space to pages with buffers (me) + - make the rmap.c functions FASTCALL / inline (me) +rmap 1: + - fix the swap leak in rmap 0 (Dave McCracken) +rmap 0: + - port of reverse mapping VM to 2.4.16 (me) diff -Nru a/Makefile b/Makefile --- a/Makefile Thu Apr 17 15:25:14 2003 +++ b/Makefile Thu Apr 17 15:25:14 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 21 -EXTRAVERSION = -pre7 +EXTRAVERSION = -pre7-rmap15g KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nru a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c --- a/arch/arm/mm/mm-armv.c Thu Apr 17 15:25:14 2003 +++ b/arch/arm/mm/mm-armv.c Thu Apr 17 15:25:14 2003 @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -470,6 +471,7 @@ * cache implementation. */ kmem_cache_t *pte_cache; +kmem_cache_t *pte_rmap_cache; /* * The constructor gets called for each object within the cache when the @@ -480,6 +482,22 @@ { unsigned long block = (unsigned long)pte; + if (!(block & 2048)) { + /* First object of two in a page - allocate the + pte_rmap_info to go with them */ + + struct page * page = virt_to_page(pte); + + if (flags & SLAB_CTOR_ATOMIC) + BUG(); + + page->mapping = kmem_cache_alloc(pte_rmap_cache, GFP_KERNEL); + if (!page->mapping) { + printk(KERN_CRIT "pte_rmap_cache alloc failed. Oops. 
Slab constructors need to be allowed to fail\n"); + /* return -ENOMEM; */ + BUG(); + } + } if (block & 2047) BUG(); @@ -488,11 +506,32 @@ PTRS_PER_PTE * sizeof(pte_t), 0); } +static void pte_cache_dtor(void *pte, kmem_cache_t *cache, unsigned long flags) +{ + unsigned long block = (unsigned long)pte; + + if (!(block & 2048)) { + /* First object of two in a page - free the + pte_rmap_info that was associated with them */ + + struct page * page = virt_to_page(pte); + + kmem_cache_free(pte_rmap_cache, page->mapping); + page->mapping = NULL; + } +} + void __init pgtable_cache_init(void) { + pte_rmap_cache = kmem_cache_create("pte-rmap-cache", + 2 * sizeof(struct arm_rmap_info), 0, 0, + NULL, NULL); + if (!pte_rmap_cache) + BUG(); + pte_cache = kmem_cache_create("pte-cache", 2 * PTRS_PER_PTE * sizeof(pte_t), 0, 0, - pte_cache_ctor, NULL); + pte_cache_ctor, pte_cache_dtor); if (!pte_cache) BUG(); } diff -Nru a/arch/i386/config.in b/arch/i386/config.in --- a/arch/i386/config.in Thu Apr 17 15:25:14 2003 +++ b/arch/i386/config.in Thu Apr 17 15:25:14 2003 @@ -207,6 +207,7 @@ 64GB CONFIG_HIGHMEM64G" off if [ "$CONFIG_HIGHMEM4G" = "y" -o "$CONFIG_HIGHMEM64G" = "y" ]; then define_bool CONFIG_HIGHMEM y + define_bool CONFIG_HIGHPTE y else define_bool CONFIG_HIGHMEM n fi diff -Nru a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c --- a/arch/i386/kernel/vm86.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/kernel/vm86.c Thu Apr 17 15:25:14 2003 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -121,7 +122,7 @@ { pgd_t *pgd; pmd_t *pmd; - pte_t *pte; + pte_t *pte, *mapped; int i; spin_lock(&tsk->mm->page_table_lock); @@ -141,12 +142,13 @@ pmd_clear(pmd); goto out; } - pte = pte_offset(pmd, 0xA0000); + mapped = pte = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); + ptep_set_wrprotect(pte); pte++; } + pte_unmap(mapped); out: spin_unlock(&tsk->mm->page_table_lock); flush_tlb(); diff -Nru a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile --- a/arch/i386/mm/Makefile Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/Makefile Thu Apr 17 15:25:14 2003 @@ -9,7 +9,7 @@ O_TARGET := mm.o -obj-y := init.o fault.o ioremap.o extable.o pageattr.o +obj-y := init.o fault.o ioremap.o extable.o pageattr.o pgtable.o export-objs := pageattr.o include $(TOPDIR)/Rules.make diff -Nru a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c --- a/arch/i386/mm/fault.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/fault.c Thu Apr 17 15:25:14 2003 @@ -392,7 +392,7 @@ goto no_context; set_pmd(pmd, *pmd_k); - pte_k = pte_offset(pmd_k, address); + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; diff -Nru a/arch/i386/mm/init.c b/arch/i386/mm/init.c --- a/arch/i386/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/init.c Thu Apr 17 15:25:14 2003 @@ -45,6 +45,8 @@ int do_check_pgt_cache(int low, int high) { + return 0; /* FIXME! 
*/ +#if 0 int freed = 0; if(pgtable_cache_size > high) { do { @@ -63,6 +65,7 @@ } while(pgtable_cache_size > low); } return freed; +#endif } /* @@ -76,7 +79,7 @@ pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ - pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -90,36 +93,6 @@ } #endif /* CONFIG_HIGHMEM */ -void show_mem(void) -{ - int i, total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - - printk("Mem-info:\n"); - show_free_areas(); - printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); - i = max_mapnr; - while (i-- > 0) { - total++; - if (PageHighMem(mem_map+i)) - highmem++; - if (PageReserved(mem_map+i)) - reserved++; - else if (PageSwapCache(mem_map+i)) - cached++; - else if (page_count(mem_map+i)) - shared += page_count(mem_map+i) - 1; - } - printk("%d pages of RAM\n", total); - printk("%d pages of HIGHMEM\n",highmem); - printk("%d reserved pages\n",reserved); - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); - printk("%ld pages in page table cache\n",pgtable_cache_size); - show_buffers(); -} - /* References to section boundaries */ extern char _text, _etext, _edata, __bss_start, _end; @@ -142,7 +115,7 @@ printk("PAE BUG #01!\n"); return; } - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); /* stored as-is, to permit clearing entries */ set_pte(pte, mk_pte_phys(phys, flags)); @@ -153,17 +126,6 @@ __flush_tlb_one(vaddr); } -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, phys, flags); -} - static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; @@ -193,7 +155,7 @@ if (pmd_none(*pmd)) { pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte))); - if (pte != pte_offset(pmd, 0)) + if (pte != pte_offset_kernel(pmd, 0)) BUG(); } vaddr += PMD_SIZE; @@ -264,7 +226,7 @@ *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL); } set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); - if (pte_base != pte_offset(pmd, 0)) + if (pte_base != pte_offset_kernel(pmd, 0)) BUG(); } @@ -286,7 +248,7 @@ pgd = swapper_pg_dir + __pgd_offset(vaddr); pmd = pmd_offset(pgd, vaddr); - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; #endif @@ -397,7 +359,7 @@ pgd = swapper_pg_dir + __pgd_offset(vaddr); pmd = pmd_offset(pgd, vaddr); - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); old_pte = *pte; *pte = mk_pte_phys(0, PAGE_READONLY); local_flush_tlb(); diff -Nru a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c --- a/arch/i386/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -49,7 +49,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c --- a/arch/i386/mm/pageattr.c Thu Apr 17 15:25:14 2003 +++ b/arch/i386/mm/pageattr.c Thu Apr 17 15:25:14 2003 @@ -29,7 +29,7 @@ return NULL; if (pmd_val(*pmd) & _PAGE_PSE) return (pte_t *)pmd; - return pte_offset(pmd, address); + return 
pte_offset_kernel(pmd, address); } static struct page *split_large_page(unsigned long address, pgprot_t prot) diff -Nru a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/i386/mm/pgtable.c Thu Apr 17 15:25:14 2003 @@ -0,0 +1,226 @@ +/* + * linux/arch/i386/mm/pgtable.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +//#include + +void show_mem(void) +{ + int total = 0, reserved = 0; + int shared = 0, cached = 0; + int highmem = 0; + struct page *page; + pg_data_t *pgdat; + unsigned long i; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); + for_each_pgdat(pgdat) { + for (i = 0; i < pgdat->node_size; ++i) { + page = pgdat->node_mem_map + i; + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + } + printk("%d pages of RAM\n", total); + printk("%d pages of HIGHMEM\n",highmem); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + __pgd_offset(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pmd = pmd_offset(pgd, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte(pfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. + */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk ("set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk ("set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + __pgd_offset(vaddr); + if (pgd_none(*pgd)) { + printk ("set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pmd = pmd_offset(pgd, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. 
+ * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + int count = 0; + pte_t *pte; + + do { + pte = (pte_t *) __get_free_page(GFP_KERNEL); + if (pte) + clear_page(pte); + else { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ); + } + } while (!pte && (count++ < 10)); + return pte; +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + int count = 0; + struct page *pte; + + do { +#if CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0); +#else + pte = alloc_pages(GFP_KERNEL, 0); +#endif + if (pte) + clear_highpage(pte); + else { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ); + } + } while (!pte && (count++ < 10)); + return pte; +} + +#if CONFIG_X86_PAE + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + + if (pgd) { + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + unsigned long pmd = __get_free_page(GFP_KERNEL); + if (!pmd) + goto out_oom; + clear_page(pmd); + set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + } + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + return pgd; +out_oom: + for (i--; i >= 0; i--) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); + return NULL; +} + +void pgd_free(pgd_t *pgd) +{ + int i; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); +} + +#else + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + + if (pgd) { + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + return pgd; +} + +void pgd_free(pgd_t *pgd) +{ + free_page((unsigned long)pgd); +} + +#endif /* CONFIG_X86_PAE */ + diff -Nru a/arch/ppc64/kernel/htab.c b/arch/ppc64/kernel/htab.c --- a/arch/ppc64/kernel/htab.c Thu Apr 17 15:25:14 2003 +++ b/arch/ppc64/kernel/htab.c Thu Apr 17 15:25:14 2003 @@ -288,7 +288,7 @@ if (!pgd_none(*pg)) { pm = pmd_offset(pg, ea); if (!pmd_none(*pm)) { - pt = pte_offset(pm, ea); + pt = pte_offset_kernel(pm, ea); pte = *pt; if (!pte_present(pte)) pt = NULL; diff -Nru a/arch/ppc64/kernel/pmc.c b/arch/ppc64/kernel/pmc.c --- a/arch/ppc64/kernel/pmc.c Thu Apr 17 15:25:14 2003 +++ b/arch/ppc64/kernel/pmc.c Thu Apr 17 15:25:14 2003 @@ -248,7 +248,7 @@ */ pgdp = pgd_offset_b(ea); pmdp = pmd_alloc(&btmalloc_mm, pgdp, ea); - ptep = pte_alloc(&btmalloc_mm, pmdp, ea); + ptep = pte_alloc_kernel(&btmalloc_mm, pmdp, ea); pte = *ptep; /* Clear any old hpte and set the new linux pte */ diff -Nru a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c --- a/arch/ppc64/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/ppc64/mm/init.c Thu Apr 17 15:25:14 2003 @@ -107,6 +107,7 @@ { int freed = 0; +#if 0 if (pgtable_cache_size > high) { do { if (pgd_quicklist) @@ -117,6 +118,7 @@ free_page((unsigned long)pte_alloc_one_fast(0, 0)), ++freed; } while (pgtable_cache_size > low); } +#endif return freed; } @@ -246,7 +248,7 @@ 
spin_lock(&ioremap_mm.page_table_lock); pgdp = pgd_offset_i(ea); pmdp = pmd_alloc(&ioremap_mm, pgdp, ea); - ptep = pte_alloc(&ioremap_mm, pmdp, ea); + ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); pa = absolute_to_phys(pa); set_pte(ptep, mk_pte_phys(pa & PAGE_MASK, __pgprot(flags))); @@ -331,7 +333,7 @@ if (!pgd_none(*pgd)) { pmd = pmd_offset(pgd, vmaddr); if (!pmd_none(*pmd)) { - ptep = pte_offset(pmd, vmaddr); + ptep = pte_offset_map(pmd, vmaddr); /* Check if HPTE might exist and flush it if so */ if (pte_val(*ptep) & _PAGE_HASHPTE) flush_hash_page(context, vmaddr, ptep); @@ -383,7 +385,7 @@ if ( pmd_end > end ) pmd_end = end; if ( !pmd_none( *pmd ) ) { - ptep = pte_offset( pmd, start ); + ptep = pte_offset_map( pmd, start ); do { if ( pte_val(*ptep) & _PAGE_HASHPTE ) flush_hash_page( context, start, ptep ); diff -Nru a/arch/s390/mm/init.c b/arch/s390/mm/init.c --- a/arch/s390/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390/mm/init.c Thu Apr 17 15:25:14 2003 @@ -47,6 +47,9 @@ int do_check_pgt_cache(int low, int high) { +#if 1 /* No quicklists in rmap */ + return 0; +#else int freed = 0; if(pgtable_cache_size > high) { do { @@ -65,6 +68,7 @@ } while(pgtable_cache_size > low); } return freed; +#endif } void show_mem(void) diff -Nru a/arch/s390/mm/ioremap.c b/arch/s390/mm/ioremap.c --- a/arch/s390/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -54,7 +54,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/s390x/mm/init.c b/arch/s390x/mm/init.c --- a/arch/s390x/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390x/mm/init.c Thu Apr 17 15:25:14 2003 @@ -45,96 +45,6 @@ pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE))); char empty_zero_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); -pmd_t *pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) -{ - unsigned long addr = (unsigned long) pgd; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - unsigned long offset = addr & 4; - pmd_t *new, *pmd2; - int i; - - if (offset == 0 && - ((*pgd_slot & _PGD_ENTRY_INV) != 0 || - (*pgd_slot & _PGD_ENTRY_LEN(2)) == 0)) { - /* Set lower pmd, upper pmd is empty. */ - *pgd_slot = __pa(pmd) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(0) | _PGD_ENTRY_LEN(1); - return pmd; - } - if (offset == 4 && - ((*pgd_slot & _PGD_ENTRY_INV) != 0 || - (*pgd_slot & _PGD_ENTRY_OFF(2)) != 0)) { - /* Lower pmd empty, set upper pmd. */ - *pgd_slot = (__pa(pmd) - 0x2000) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(2) | _PGD_ENTRY_LEN(3); - return pmd; - } - /* We have to enlarge the pmd to 16K if we arrive here. */ - new = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); - if (new == NULL) { - pmd_free(pmd); - return NULL; - } - /* Set the PG_arch_1 bit on the first and the third pmd page - so that pmd_free_fast can recognize pmds that have been - allocated with an order 2 allocation. */ - set_bit(PG_arch_1, &virt_to_page(new)->flags); - set_bit(PG_arch_1, &virt_to_page(new+PTRS_PER_PMD)->flags); - /* Now copy the two pmds to the new memory area. 
*/ - if (offset == 0) { - pmd2 = (pmd_t *)(*pgd_slot & PAGE_MASK) + PTRS_PER_PMD; - memcpy(new, pmd, sizeof(pmd_t)*PTRS_PER_PMD); - memcpy(new + PTRS_PER_PMD, pmd2, sizeof(pmd_t)*PTRS_PER_PMD); - } else { - pmd2 = (pmd_t *)(*pgd_slot & PAGE_MASK); - memcpy(new, pmd2, sizeof(pmd_t)*PTRS_PER_PMD); - memcpy(new + PTRS_PER_PMD, pmd, sizeof(pmd_t)*PTRS_PER_PMD); - } - *pgd_slot = __pa(new) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(0) | _PGD_ENTRY_LEN(3); - for (i = 0; i < PTRS_PER_PMD; i++) { - pmd_clear(pmd + i); - pmd_clear(pmd2 + i); - } - pmd_free(pmd); - pmd_free(pmd2); - return new; -} - -void pmd_free_order2(pmd_t *pmd) -{ - pmd_t *pmd2 = (pmd_t *) ((unsigned long) pmd ^ 8192); - - clear_bit(PG_arch_1, &virt_to_page(pmd)->flags); - if (test_bit(PG_arch_1, &virt_to_page(pmd2)->flags) == 0) { - /* The other pmd of the order 2 allocation has already - been freed. Now we can release the order 2 allocation. */ - free_pages((unsigned long) pmd & ~8192, 2); - } -} - -int do_check_pgt_cache(int low, int high) -{ - int freed = 0; - if(pgtable_cache_size > high) { - do { - if(pgd_quicklist) { - free_pgd_slow(get_pgd_fast()); - freed += 2; - } - if(pmd_quicklist) { - pmd_free_slow(pmd_alloc_one_fast(NULL, 0)); - freed += 2; - } - if(pte_quicklist) { - pte_free_slow(pte_alloc_one_fast(NULL, 0)); - freed += 1; - } - } while(pgtable_cache_size > low); - } - return freed; -} - void show_mem(void) { int i, total = 0,reserved = 0; @@ -181,9 +91,13 @@ void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned long pgdir_k = (__pa(swapper_pg_dir) & PAGE_MASK) | + _KERN_REGION_TABLE; static const int ssm_mask = 0x04000000L; unsigned long dma_pfn, address, end_mem; pgd_t * pg_dir; + pmd_t * pm_dir; + pte_t * pt_dir; int i,j,k; dma_pfn = MAX_DMA_ADDRESS >> PAGE_SHIFT; @@ -204,8 +118,7 @@ pg_dir = swapper_pg_dir; address = 0; end_mem = (unsigned long) __va(max_low_pfn*PAGE_SIZE); - for (i = 0 ; i < PTRS_PER_PGD/2 ; i++, pg_dir += 2) { - pmd_t *pm_dir; + for (i = 0 ; i < PTRS_PER_PGD ; i++,pg_dir++) { if (address >= end_mem) { pgd_clear(pg_dir); @@ -213,11 +126,9 @@ } pm_dir = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE*4); - *((unsigned long *) pg_dir) = __pa(pm_dir) | _PGD_ENTRY_MASK | - _PGD_ENTRY_LEN(3) | _PGD_ENTRY_OFF(0); + pgd_populate(&init_mm, pg_dir, pm_dir); - for (j = 0 ; j < PTRS_PER_PMD*2 ; j++, pm_dir++) { - pte_t *pt_dir; + for (j = 0 ; j < PTRS_PER_PMD ; j++,pm_dir++) { if (address >= end_mem) { pmd_clear(pm_dir); @@ -225,7 +136,7 @@ } pt_dir = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - pmd_populate(&init_mm, pm_dir, pt_dir); + pmd_populate_kernel(&init_mm, pm_dir, pt_dir); for (k = 0 ; k < PTRS_PER_PTE ; k++,pt_dir++) { pte_t pte = mk_pte_phys(address, PAGE_KERNEL); @@ -249,8 +160,8 @@ "lctlg 7,7,%0\n\t" "lctlg 13,13,%0\n\t" "ssm %1" - : :"m" (__pa(swapper_pg_dir) | _KERN_REGION_TABLE), - "m" (ssm_mask)); + : :"m" (pgdir_k), "m" (ssm_mask)); + local_flush_tlb(); return; diff -Nru a/arch/s390x/mm/ioremap.c b/arch/s390x/mm/ioremap.c --- a/arch/s390x/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/s390x/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -54,7 +54,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c --- a/arch/x86_64/mm/fault.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/fault.c Thu Apr 17 15:25:14 2003 
@@ -341,7 +341,7 @@ pmd = pmd_offset(pgd, address); if (!pmd_present(*pmd)) goto bad_area_nosemaphore; - pte = pte_offset(pmd, address); + pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) goto bad_area_nosemaphore; diff -Nru a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c --- a/arch/x86_64/mm/init.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/init.c Thu Apr 17 15:25:14 2003 @@ -142,12 +142,12 @@ if (pmd_none(*pmd)) { pte = (pte_t *) spp_getpage(); set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); - if (pte != pte_offset(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) { printk("PAGETABLE BUG #02!\n"); return; } } - pte = pte_offset(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, mk_pte_phys(phys, prot)); /* diff -Nru a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c --- a/arch/x86_64/mm/ioremap.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/ioremap.c Thu Apr 17 15:25:14 2003 @@ -49,7 +49,7 @@ if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -Nru a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c --- a/arch/x86_64/mm/pageattr.c Thu Apr 17 15:25:14 2003 +++ b/arch/x86_64/mm/pageattr.c Thu Apr 17 15:25:14 2003 @@ -23,7 +23,7 @@ if ((pmd_val(*pmd) & PAGE_LARGE) == PAGE_LARGE) return (pte_t *)pmd; - return pte_offset(pmd, address); + return pte_offset_kernel(pmd, address); } static struct page *split_large_page(unsigned long address, pgprot_t prot) diff -Nru a/drivers/char/drm/drm_proc.h b/drivers/char/drm/drm_proc.h --- a/drivers/char/drm/drm_proc.h Thu Apr 17 15:25:14 2003 +++ b/drivers/char/drm/drm_proc.h Thu Apr 17 15:25:14 2003 @@ -448,7 +448,7 @@ for (i = vma->vm_start; i < vma->vm_end; i += PAGE_SIZE) { pgd = pgd_offset(vma->vm_mm, i); pmd = pmd_offset(pgd, i); - pte = pte_offset(pmd, i); + pte = pte_offset_map(pmd, i); if (pte_present(*pte)) { address = __pa(pte_page(*pte)) + (i & (PAGE_SIZE-1)); @@ -464,6 +464,7 @@ } else { DRM_PROC_PRINT(" 0x%08lx\n", i); } + pte_unmap(pte); } #endif } diff -Nru a/drivers/char/mem.c b/drivers/char/mem.c --- a/drivers/char/mem.c Thu Apr 17 15:25:14 2003 +++ b/drivers/char/mem.c Thu Apr 17 15:25:14 2003 @@ -542,7 +542,7 @@ pmd = pmd_offset(pgd, kaddr); if (pmd_none(*pmd) || pmd_bad(*pmd)) goto out; - ptep = pte_offset(pmd, kaddr); + ptep = pte_offset_kernel(pmd, kaddr); if (!ptep) goto out; pte = *ptep; diff -Nru a/drivers/sgi/char/graphics.c b/drivers/sgi/char/graphics.c --- a/drivers/sgi/char/graphics.c Thu Apr 17 15:25:14 2003 +++ b/drivers/sgi/char/graphics.c Thu Apr 17 15:25:14 2003 @@ -219,6 +219,7 @@ { pgd_t *pgd; pmd_t *pmd; pte_t *pte; int board = GRAPHICS_CARD (vma->vm_dentry->d_inode->i_rdev); + struct page *page; unsigned long virt_add, phys_add; @@ -247,8 +248,10 @@ pgd = pgd_offset(current->mm, address); pmd = pmd_offset(pgd, address); - pte = pte_offset(pmd, address); - return pte_page(*pte); + pte = pte_offset_map(pmd, address); + page = pte_page(*pte); + pte_unmap(pte); + return page; } /* diff -Nru a/fs/buffer.c b/fs/buffer.c --- a/fs/buffer.c Thu Apr 17 15:25:14 2003 +++ b/fs/buffer.c Thu Apr 17 15:25:14 2003 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,8 @@ static int nr_buffers_type[NR_LIST]; static unsigned long size_buffers_type[NR_LIST]; +static LIST_HEAD(buffer_lru); +static int nr_used_buffer_heads; static struct buffer_head * unused_list; 
static int nr_unused_buffer_heads; static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; @@ -1134,6 +1137,8 @@ { if (unlikely(buffer_attached(bh))) BUG(); + list_del(&bh->lru); + nr_used_buffer_heads--; if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); } else { @@ -1169,6 +1174,8 @@ bh = unused_list; unused_list = bh->b_next_free; nr_unused_buffer_heads--; + list_add(&bh->lru, &buffer_lru); + nr_used_buffer_heads++; spin_unlock(&unused_list_lock); return bh; } @@ -1181,6 +1188,10 @@ if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) { bh->b_blocknr = -1; bh->b_this_page = NULL; + spin_lock(&unused_list_lock); + list_add(&bh->lru, &buffer_lru); + nr_used_buffer_heads++; + spin_unlock(&unused_list_lock); return bh; } @@ -1193,6 +1204,8 @@ bh = unused_list; unused_list = bh->b_next_free; nr_unused_buffer_heads--; + list_add(&bh->lru, &buffer_lru); + nr_used_buffer_heads++; spin_unlock(&unused_list_lock); return bh; } @@ -1335,6 +1348,45 @@ } /* + * try_to_reclaim_buffers - get rid of buffer heads when the VM needs space + * @priority - reclaim priority + * @gfp_mask - page freeing mask + * + * We rotate the buffers on the buffer_lru list, trying to reclaim + * them. + */ +int try_to_reclaim_buffers(int priority, unsigned int gfp_mask) +{ + int todo = nr_used_buffer_heads >> priority; + struct list_head * entry; + struct buffer_head * bh; + struct page * page; + int reclaimed = 0; + + spin_lock(&unused_list_lock); + while (todo-- && !list_empty(&buffer_lru)) { + entry = buffer_lru.prev; + list_move(entry, &buffer_lru); + bh = list_entry(entry, struct buffer_head, lru); + + page = bh->b_page; + if (TryLockPage(page)) + continue; + page_cache_get(page); + spin_unlock(&unused_list_lock); + + reclaimed += try_to_release_page(page, gfp_mask); + + UnlockPage(page); + page_cache_release(page); + spin_lock(&unused_list_lock); + } + spin_unlock(&unused_list_lock); + + return reclaimed; +} + +/* * We don't have to release all buffers here, but * we have to be sure that no dirty buffer is left * and no IO is going on (no buffer is locked), because @@ -2544,63 +2596,23 @@ return 1; } -/* - * The first time the VM inspects a page which has locked buffers, it - * will just mark it as needing waiting upon on the scan of the page LRU. - * BH_Wait_IO is used for this. - * - * The second time the VM visits the page, if it still has locked - * buffers, it is time to start writing them out. (BH_Wait_IO was set). - * - * The third time the VM visits the page, if the I/O hasn't completed - * then it's time to wait upon writeout. BH_Lock and BH_Launder are - * used for this. - * - * There is also the case of buffers which were locked by someone else - * - write(2) callers, bdflush, etc. There can be a huge number of these - * and we don't want to just skip them all and fail the page allocation. - * We want to be able to wait on these buffers as well. - * - * The BH_Launder bit is set in submit_bh() to indicate that I/O is - * underway against the buffer, doesn't matter who started it - we know - * that the buffer will eventually come unlocked, and so it's safe to - * wait on it. - * - * The caller holds the page lock and the caller will free this page - * into current->local_page, so by waiting on the page's buffers the - * caller is guaranteed to obtain this page. - * - * sync_page_buffers() will sort-of return true if all the buffers - * against this page are freeable, so try_to_free_buffers() should - * try to free the page's buffers a second time. 
This is a bit - * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly. - */ -static int sync_page_buffers(struct buffer_head *head) +static void sync_page_buffers(struct buffer_head *head) { struct buffer_head * bh = head; - int tryagain = 1; do { if (!buffer_dirty(bh) && !buffer_locked(bh)) continue; /* Don't start IO first time around.. */ - if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) { - tryagain = 0; + if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) continue; - } - /* Second time through we start actively writing out.. */ - if (test_and_set_bit(BH_Lock, &bh->b_state)) { - if (unlikely(!buffer_launder(bh))) { - tryagain = 0; - continue; - } - wait_on_buffer(bh); - tryagain = 1; + /* If we cannot lock the buffer just skip it. */ + if (test_and_set_bit(BH_Lock, &bh->b_state)) continue; - } + /* Second time through we start actively writing out.. */ if (!atomic_set_buffer_clean(bh)) { unlock_buffer(bh); continue; @@ -2610,10 +2622,9 @@ get_bh(bh); bh->b_end_io = end_buffer_io_sync; submit_bh(WRITE, bh); - tryagain = 0; } while ((bh = bh->b_this_page) != head); - return tryagain; + return; } /* @@ -2637,7 +2648,6 @@ { struct buffer_head * tmp, * bh = page->buffers; -cleaned_buffers_try_again: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); tmp = bh; @@ -2680,15 +2690,9 @@ write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); gfp_mask = pf_gfp_mask(gfp_mask); - if (gfp_mask & __GFP_IO) { - if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { - if (sync_page_buffers(bh)) { - /* no IO or waiting next time */ - gfp_mask = 0; - goto cleaned_buffers_try_again; - } - } - } + if ((gfp_mask & __GFP_IO) && + ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page))) + sync_page_buffers(bh); if (balance_dirty_state() >= 0) wakeup_bdflush(); return 0; @@ -2953,9 +2957,35 @@ break; ndirty -= NRSYNC; } - if (ndirty > 0 || bdflush_stop()) + if (ndirty > 0 || bdflush_stop()) { + run_task_queue(&tq_disk); interruptible_sleep_on(&bdflush_wait); + } + } +} + + +/* + * Do some IO post-processing here!!! 
+ */ +void do_io_postprocessing(void) +{ + int i; + struct buffer_head *bh, *next; + + spin_lock(&lru_list_lock); + bh = lru_list[BUF_LOCKED]; + if (bh) { + for (i = nr_buffers_type[BUF_LOCKED]; i-- > 0; bh = next) { + next = bh->b_next_free; + + if (!buffer_locked(bh)) + __refile_buffer(bh); + else + break; + } } + spin_unlock(&lru_list_lock); } /* @@ -3009,6 +3039,7 @@ #ifdef DEBUG printk(KERN_DEBUG "kupdate() activated...\n"); #endif + do_io_postprocessing(); sync_old_buffers(); run_task_queue(&tq_disk); } diff -Nru a/fs/exec.c b/fs/exec.c --- a/fs/exec.c Thu Apr 17 15:25:14 2003 +++ b/fs/exec.c Thu Apr 17 15:25:14 2003 @@ -286,33 +286,45 @@ pgd_t * pgd; pmd_t * pmd; pte_t * pte; + struct pte_chain * pte_chain; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); pgd = pgd_offset(tsk->mm, address); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto out_nounlock; + spin_lock(&tsk->mm->page_table_lock); pmd = pmd_alloc(tsk->mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc(tsk->mm, pmd, address); + pte = pte_alloc_map(tsk->mm, pmd, address); if (!pte) goto out; - if (!pte_none(*pte)) + if (!pte_none(*pte)) { + pte_unmap(pte); goto out; + } lru_cache_add(page); flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + pte_chain = page_add_rmap(page, pte, pte_chain); tsk->mm->rss++; + pte_unmap(pte); spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ + pte_chain_free(pte_chain); return; out: spin_unlock(&tsk->mm->page_table_lock); +out_nounlock: __free_page(page); force_sig(SIGKILL, tsk); + pte_chain_free(pte_chain); return; } diff -Nru a/fs/inode.c b/fs/inode.c --- a/fs/inode.c Thu Apr 17 15:25:14 2003 +++ b/fs/inode.c Thu Apr 17 15:25:14 2003 @@ -49,7 +49,8 @@ * other linked list is the "type" list: * "in_use" - valid inode, i_count > 0, i_nlink > 0 * "dirty" - as "in_use" but also dirty - * "unused" - valid inode, i_count = 0 + * "unused" - valid inode, i_count = 0, no pages in the pagecache + * "unused_pagecache" - valid inode, i_count = 0 but has pages in the pagecache * * A "dirty" list is maintained for each super block, * allowing for low-overhead inode sync() operations. 
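
For orientation: the comment hunk just above splits unused inodes into a class with and a class without pagecache pages, and the hunks that follow add the inode_unused_pagecache list plus __refile_inode()/refile_inode() to keep each inode on the right list. As a reading aid only, the routing described there condenses to the selection below; the helper name is invented, the real code is __refile_inode() in the next hunk, and the I_FREEING and unhashed-inode early returns are left out.

/*
 * Sketch only: which list an inode belongs on under the scheme described
 * above.  Mirrors __refile_inode() from the next hunk; the helper name is
 * invented and the I_FREEING / unhashed checks are omitted.
 */
#include <linux/fs.h>

static struct list_head *inode_target_list(struct inode *inode)
{
	if (inode->i_state & I_DIRTY)
		return &inode->i_sb->s_dirty;		/* "dirty" */
	if (atomic_read(&inode->i_count))
		return &inode_in_use;			/* "in_use" */
	if (inode->i_data.nrpages)
		return &inode_unused_pagecache;		/* unused, still has pagecache */
	return &inode_unused;				/* unused and cheap to reclaim */
}

Per the rmap 15e changelog entry above ("make reclaiming unused inodes more efficient"), the point of the extra list is presumably that the unused-inode scan only has to look at inodes whose pagecache is already gone, instead of repeatedly skipping over ones that still hold pages.
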
@@ -57,6 +58,7 @@ static LIST_HEAD(inode_in_use); static LIST_HEAD(inode_unused); +static LIST_HEAD(inode_unused_pagecache); static struct list_head *inode_hashtable; static LIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ @@ -254,6 +256,37 @@ inodes_stat.nr_unused--; } +static inline void __refile_inode(struct inode *inode) +{ + struct list_head *to; + + if (inode->i_state & I_FREEING) + return; + if (list_empty(&inode->i_hash)) + return; + + if (inode->i_state & I_DIRTY) + to = &inode->i_sb->s_dirty; + else if (atomic_read(&inode->i_count)) + to = &inode_in_use; + else if (inode->i_data.nrpages) + to = &inode_unused_pagecache; + else + to = &inode_unused; + list_del(&inode->i_list); + list_add(&inode->i_list, to); +} + +void refile_inode(struct inode *inode) +{ +/* if (in_interrupt()) + BUG(); */ + if (!inode) return; + spin_lock(&inode_lock); + __refile_inode(inode); + spin_unlock(&inode_lock); +} + static inline void __sync_one(struct inode *inode, int sync) { unsigned dirty; @@ -280,17 +313,8 @@ spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; - if (!(inode->i_state & I_FREEING)) { - struct list_head *to; - if (inode->i_state & I_DIRTY) - to = &inode->i_sb->s_dirty; - else if (atomic_read(&inode->i_count)) - to = &inode_in_use; - else - to = &inode_unused; - list_del(&inode->i_list); - list_add(&inode->i_list, to); - } + if (!(inode->i_state & I_FREEING)) + __refile_inode(inode); wake_up(&inode->i_wait); } @@ -659,6 +683,7 @@ spin_lock(&inode_lock); busy = invalidate_list(&inode_in_use, sb, &throw_away); busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&inode_unused_pagecache, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); spin_unlock(&inode_lock); @@ -734,8 +759,7 @@ if (atomic_read(&inode->i_count)) continue; list_del(tmp); - list_del(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_hash); + list_del_init(&inode->i_hash); list_add(tmp, freeable); inode->i_state |= I_FREEING; count++; @@ -1063,10 +1087,8 @@ BUG(); } else { if (!list_empty(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_LOCK))) { - list_del(&inode->i_list); - list_add(&inode->i_list, &inode_unused); - } + if (!(inode->i_state & (I_DIRTY|I_LOCK))) + __refile_inode(inode); inodes_stat.nr_unused++; spin_unlock(&inode_lock); if (!sb || (sb->s_flags & MS_ACTIVE)) @@ -1221,6 +1243,11 @@ remove_inode_dquot_ref(inode, type, &tofree_head); } list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused_pagecache) { inode = list_entry(act_head, struct inode, i_list); if (inode->i_sb == sb && IS_QUOTAINIT(inode)) remove_inode_dquot_ref(inode, type, &tofree_head); diff -Nru a/fs/proc/array.c b/fs/proc/array.c --- a/fs/proc/array.c Thu Apr 17 15:25:14 2003 +++ b/fs/proc/array.c Thu Apr 17 15:25:14 2003 @@ -399,7 +399,7 @@ static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total) { - pte_t * pte; + pte_t * pte, *mapping; unsigned long end; if (pmd_none(*pmd)) @@ -409,7 +409,7 @@ pmd_clear(pmd); return; } - pte = pte_offset(pmd, address); + mapping = pte = pte_offset_map(pmd, address); address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -434,6 +434,7 @@ if (page_count(pte_page(page)) > 1) ++*shared; } while (address < 
end); + pte_unmap(mapping); } static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c Thu Apr 17 15:25:14 2003 +++ b/fs/proc/proc_misc.c Thu Apr 17 15:25:14 2003 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -186,7 +187,12 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inactive: %8u kB\n" + "ActiveAnon: %8u kB\n" + "ActiveCache: %8u kB\n" + "Inact_dirty: %8u kB\n" + "Inact_laundry:%8u kB\n" + "Inact_clean: %8u kB\n" + "Inact_target: %8u kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -199,8 +205,13 @@ K(i.bufferram), K(pg_size - swapper_space.nrpages), K(swapper_space.nrpages), - K(nr_active_pages), - K(nr_inactive_pages), + K(nr_active_anon_pages()) + K(nr_active_cache_pages()), + K(nr_active_anon_pages()), + K(nr_active_cache_pages()), + K(nr_inactive_dirty_pages()), + K(nr_inactive_laundry_pages()), + K(nr_inactive_clean_pages()), + K(inactive_target()), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -Nru a/include/asm-alpha/rmap.h b/include/asm-alpha/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-alpha/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _ALPHA_RMAP_H +#define _ALPHA_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-arm/proc-armv/rmap.h b/include/asm-arm/proc-armv/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-arm/proc-armv/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,72 @@ +#ifndef _ARMV_RMAP_H +#define _ARMV_RMAP_H +/* + * linux/include/asm-arm/proc-armv/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * + * We use the struct page of the page table page to find a pointer + * to an array of two 'struct arm_rmap_info's, one for each of the + * two page tables in each page. 
+ * + * - rmi->mm points to the process' mm_struct + * - rmi->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +struct arm_rmap_info { + struct mm_struct *mm; + unsigned long index; +}; + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = mm; + rmi->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = NULL; + rmi->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + return rmi->mm; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + unsigned long low_bits; + + if (((unsigned long)ptep)&2048) + rmi++; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return rmi->index + low_bits; +} + +#endif /* _ARMV_RMAP_H */ diff -Nru a/include/asm-arm/rmap.h b/include/asm-arm/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-arm/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,6 @@ +#ifndef _ARM_RMAP_H +#define _ARM_RMAP_H + +#include + +#endif /* _ARM_RMAP_H */ diff -Nru a/include/asm-cris/rmap.h b/include/asm-cris/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-cris/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _CRIS_RMAP_H +#define _CRIS_RMAP_H + +/* nothing to see, move along :) */ +#include + +#endif diff -Nru a/include/asm-generic/rmap.h b/include/asm-generic/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-generic/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,90 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout. + * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + * + * For CONFIG_HIGHPTE, we need to represent the address of a pte in a + * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE + * bits and is then ORed with the byte offset of the pte within its page. + * + * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for + * the offset. + * + * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for + * the offset. + */ +#include + +static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) +{ +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(struct page * page) +{ + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = kmap_atomic_to_page(ptep); + + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = kmap_atomic_to_page(ptep); + unsigned long low_bits; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return page->index + low_bits; +} + +#if CONFIG_HIGHPTE +static inline pte_addr_t ptep_to_paddr(pte_t *ptep) +{ + pte_addr_t paddr; + paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; + return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); +} +#else +static inline pte_addr_t ptep_to_paddr(pte_t *ptep) +{ + return (pte_addr_t)ptep; +} +#endif + +#ifndef CONFIG_HIGHPTE +static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) +{ + return (pte_t *)pte_paddr; +} + +static inline void rmap_ptep_unmap(pte_t *pte) +{ + return; +} +#endif + +#endif /* _GENERIC_RMAP_H */ diff -Nru a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h --- a/include/asm-i386/fixmap.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/fixmap.h Thu Apr 17 15:25:14 2003 @@ -101,6 +101,7 @@ #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) extern void __this_fixmap_does_not_exist(void); @@ -124,6 +125,12 @@ __this_fixmap_does_not_exist(); return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); } #endif diff -Nru a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h --- a/include/asm-i386/highmem.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/highmem.h Thu Apr 17 15:25:14 2003 @@ -127,6 +127,20 @@ #endif } +static inline struct page *kmap_atomic_to_page(void *ptr) +{ + unsigned long idx, vaddr = (unsigned long)ptr; + pte_t *pte; + + if (vaddr < FIXADDR_START) + return virt_to_page(ptr); + + idx = virt_to_fix(vaddr); + pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + return pte_page(*pte); +} + + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff -Nru a/include/asm-i386/kmap_types.h b/include/asm-i386/kmap_types.h --- a/include/asm-i386/kmap_types.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/kmap_types.h Thu Apr 17 15:25:14 2003 @@ -8,6 +8,9 @@ KM_USER0, KM_USER1, KM_BH_IRQ, + KM_PTE0, + KM_PTE1, + KM_PTE2, KM_TYPE_NR }; diff -Nru a/include/asm-i386/page.h b/include/asm-i386/page.h --- a/include/asm-i386/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/page.h Thu Apr 17 15:25:14 2003 @@ -131,7 +131,13 @@ #define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +#ifndef CONFIG_DISCONTIGMEM +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif /* !CONFIG_DISCONTIGMEM */ +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> 
PAGE_SHIFT) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ diff -Nru a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h --- a/include/asm-i386/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -5,143 +5,47 @@ #include #include #include +#include /* for struct page */ #define pgd_quicklist (current_cpu_data.pgd_quick) #define pmd_quicklist (current_cpu_data.pmd_quick) #define pte_quicklist (current_cpu_data.pte_quick) #define pgtable_cache_size (current_cpu_data.pgtable_cache_sz) -#define pmd_populate(mm, pmd, pte) \ +#define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +{ + set_pmd(pmd, __pmd(_PAGE_TABLE + + ((unsigned long long)page_to_pfn(pte) << + (unsigned long long) PAGE_SHIFT))); +} /* * Allocate and free page tables. */ -#if defined (CONFIG_X86_PAE) -/* - * We can't include here, thus these uglinesses. - */ -struct kmem_cache_s; - -extern struct kmem_cache_s *pae_pgd_cachep; -extern void *kmem_cache_alloc(struct kmem_cache_s *, int); -extern void kmem_cache_free(struct kmem_cache_s *, void *); - - -static inline pgd_t *get_pgd_slow(void) -{ - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); - - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; -} - -#else - -static inline pgd_t *get_pgd_slow(void) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); - - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -} - -#endif /* CONFIG_X86_PAE */ - -static inline pgd_t *get_pgd_fast(void) -{ - unsigned long *ret; +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(pgd_t *pgd); - if ((ret = pgd_quicklist) != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } else - ret = (unsigned long *)get_pgd_slow(); - return (pgd_t *)ret; -} +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); -static inline void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long *)pgd = (unsigned long) pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size++; -} +#define pte_alloc_one_fast(mm, address) (0) +#define pmd_alloc_one_fast(mm, address) (0) -static inline void free_pgd_slow(pgd_t *pgd) +static inline void pte_free_kernel(pte_t *pte) { -#if defined(CONFIG_X86_PAE) - int i; - - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); -#else - free_page((unsigned long)pgd); -#endif -} - -static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte; - - pte = (pte_t *) __get_free_page(GFP_KERNEL); - if (pte) - clear_page(pte); - return pte; -} - -static inline pte_t *pte_alloc_one_fast(struct 
mm_struct *mm, - unsigned long address) -{ - unsigned long *ret; - - if ((ret = (unsigned long *)pte_quicklist) != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size--; - } - return (pte_t *)ret; + free_page((unsigned long)pte); } -static inline void pte_free_fast(pte_t *pte) +static inline void pte_free(struct page *pte) { - *(unsigned long *)pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; + __free_page(pte); } -static __inline__ void pte_free_slow(pte_t *pte) -{ - free_page((unsigned long)pte); -} -#define pte_free(pte) pte_free_fast(pte) -#define pgd_free(pgd) free_pgd_slow(pgd) -#define pgd_alloc(mm) get_pgd_fast() +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is @@ -149,11 +53,9 @@ * (In the PAE case we free the pmds as part of the pgd.) */ -#define pmd_alloc_one_fast(mm, addr) ({ BUG(); ((pmd_t *)1); }) #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free_slow(x) do { } while (0) -#define pmd_free_fast(x) do { } while (0) #define pmd_free(x) do { } while (0) +#define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() extern int do_check_pgt_cache(int, int); diff -Nru a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h --- a/include/asm-i386/pgtable-2level.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgtable-2level.h Thu Apr 17 15:25:14 2003 @@ -60,6 +60,10 @@ #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) (mem_map+((unsigned long)(((x).pte_low >> PAGE_SHIFT)))) #define pte_none(x) (!(x).pte_low) -#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) +#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) +#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) + +#define __mk_pte(nr,prot) pfn_pte(nr,prot) #endif /* _I386_PGTABLE_2LEVEL_H */ diff -Nru a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h --- a/include/asm-i386/pgtable-3level.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgtable-3level.h Thu Apr 17 15:25:14 2003 @@ -89,10 +89,12 @@ return a.pte_low == b.pte_low && a.pte_high == b.pte_high; } -#define pte_page(x) (mem_map+(((x).pte_low >> PAGE_SHIFT) | ((x).pte_high << (32 - PAGE_SHIFT)))) +#define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_none(x) (!(x).pte_low && !(x).pte_high) +#define pte_pfn(x) (((x).pte_low >> PAGE_SHIFT) | ((x).pte_high << (32 - PAGE_SHIFT))) -static inline pte_t __mk_pte(unsigned long page_nr, pgprot_t pgprot) +#define __mk_pte(nr,prot) pfn_pte(nr,prot) +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { pte_t pte; @@ -100,5 +102,12 @@ pte.pte_low = (page_nr << PAGE_SHIFT) | pgprot_val(pgprot); return pte; } + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); +} + +extern struct kmem_cache_s *pae_pgd_cachep; #endif /* _I386_PGTABLE_3LEVEL_H */ diff -Nru a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h --- a/include/asm-i386/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-i386/pgtable.h Thu Apr 17 15:25:14 2003 @@ -320,9 +320,13 @@ #define page_pte(page) page_pte_prot(page, __pgprot(0)) -#define pmd_page(pmd) \ +#define pmd_page_kernel(pmd) \ ((unsigned long) __va(pmd_val(pmd) & 
PAGE_MASK)) +#ifndef CONFIG_DISCONTIGMEM +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#endif /* !CONFIG_DISCONTIGMEM */ + /* to find an entry in a page-table-directory. */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -339,8 +343,35 @@ /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset(dir, address) ((pte_t *) pmd_page(*(dir)) + \ - __pte_offset(address)) +#define pte_offset_kernel(dir, address) \ + ((pte_t *) pmd_page_kernel(*(dir)) + __pte_offset(address)) + +#if defined(CONFIG_HIGHPTE) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address)) +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#endif + +#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) +typedef u32 pte_addr_t; +#endif + +#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) +typedef u64 pte_addr_t; +#endif + +#if !defined(CONFIG_HIGHPTE) +typedef pte_t *pte_addr_t; +#endif /* * The i386 doesn't have any external MMU info: the kernel page diff -Nru a/include/asm-i386/rmap.h b/include/asm-i386/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-i386/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,21 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* nothing to see, move along */ +#include + +#ifdef CONFIG_HIGHPTE +static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) +{ + unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); + unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; + return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); +} + +static inline void rmap_ptep_unmap(pte_t *pte) +{ + kunmap_atomic(pte, KM_PTE2); +} +#endif + +#endif diff -Nru a/include/asm-ia64/rmap.h b/include/asm-ia64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ia64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _IA64_RMAP_H +#define _IA64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-m68k/rmap.h b/include/asm-m68k/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-m68k/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _M68K_RMAP_H +#define _M68K_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-mips/rmap.h b/include/asm-mips/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-mips/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _MIPS_RMAP_H +#define _MIPS_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-mips64/rmap.h b/include/asm-mips64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-mips64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _MIPS64_RMAP_H +#define _MIPS64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-ppc/rmap.h b/include/asm-ppc/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ppc/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,9 @@ +#ifndef _PPC_RMAP_H +#define 
_PPC_RMAP_H + +/* PPC calls pte_alloc() before mem_map[] is setup ... */ +#define BROKEN_PPC_PTE_ALLOC_ONE + +#include + +#endif diff -Nru a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h --- a/include/asm-ppc64/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-ppc64/page.h Thu Apr 17 15:25:14 2003 @@ -234,6 +234,17 @@ #define __a2p(x) ((void *) absolute_to_phys(x)) #define __a2v(x) ((void *) __va(absolute_to_phys(x))) +#ifdef CONFIG_DISCONTIGMEM +#define page_to_pfn(page) discontigmem_page_to_pfn(page) +#define pfn_to_page(pfn) discontigmem_pfn_to_page(pfn) +#define pfn_valid(pfn) discontigmem_pfn_valid(pfn) +#else +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif + + #define virt_to_page(kaddr) (mem_map+(__pa((unsigned long)kaddr) >> PAGE_SHIFT)) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) diff -Nru a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h --- a/include/asm-ppc64/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-ppc64/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -80,6 +80,36 @@ return pmd; } +#define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte) +#define pmd_populate(mm, pmd, pte_page) \ + pmd_populate_kernel(mm, pmd, page_address(pte_page)) + +static inline pte_t * +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +{ + int count = 0; + pte_t *pte; + + do { + pte = (pte_t *)__get_free_page(GFP_KERNEL); + if (pte) + clear_page(pte); + else { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ); + } + } while (!pte && (count++ < 10)); + + return pte; +} + +static inline void +pte_free_kernel(pte_t *pte) +{ + free_page((unsigned long)pte); +} + + static inline void pmd_free (pmd_t *pmd) { @@ -88,39 +118,20 @@ ++pgtable_cache_size; } -#define pmd_populate(MM, PMD, PTE) pmd_set(PMD, PTE) +#define pte_alloc_one_fast(mm, address) (0) -static inline pte_t* -pte_alloc_one_fast (struct mm_struct *mm, unsigned long addr) +static inline struct page * +pte_alloc_one(struct mm_struct *mm, unsigned long address) { - unsigned long *ret = (unsigned long *)pte_quicklist; - - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - --pgtable_cache_size; - } - return (pte_t *)ret; -} + pte_t *pte = pte_alloc_one_kernel(mm, address); + if (pte) + return virt_to_page(pte); -static inline pte_t* -pte_alloc_one (struct mm_struct *mm, unsigned long addr) -{ - pte_t *pte = (pte_t *) __get_free_page(GFP_KERNEL); - - if (pte != NULL) - clear_page(pte); - return pte; + return NULL; } -static inline void -pte_free (pte_t *pte) -{ - *(unsigned long *)pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - ++pgtable_cache_size; -} +#define pte_free(pte_page) pte_free_kernel(page_address(pte_page)) extern int do_check_pgt_cache(int, int); diff -Nru a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h --- a/include/asm-ppc64/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-ppc64/pgtable.h Thu Apr 17 15:25:14 2003 @@ -196,7 +196,8 @@ #define pmd_bad(pmd) ((pmd_val(pmd)) == 0) #define pmd_present(pmd) ((pmd_val(pmd)) != 0) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) -#define pmd_page(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) #define pgd_set(pgdp, pmdp) (pgd_val(*(pgdp)) = (__ba_to_bpn(pmdp))) #define pgd_none(pgd) (!pgd_val(pgd)) #define pgd_bad(pgd) ((pgd_val(pgd)) == 0) @@ -217,8 +218,13 
@@ ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. */ -#define pte_offset(dir,addr) \ - ((pte_t *) pmd_page(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define pte_offset_kernel(dir,addr) \ + ((pte_t *) pmd_page_kernel(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) + +#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_unmap(pte) do { } while(0) +#define pte_unmap_nested(pte) do { } while(0) /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ @@ -399,6 +405,8 @@ extern void build_valid_hpte(unsigned long vsid, unsigned long ea, unsigned long pa, pte_t * ptep, unsigned hpteflags, unsigned bolted ); + +typedef pte_t *pte_addr_t; /* Encode and de-code a swap entry */ #define SWP_TYPE(entry) (((entry).val >> 1) & 0x3f) diff -Nru a/include/asm-ppc64/rmap.h b/include/asm-ppc64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ppc64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,9 @@ +#ifndef _PPC64_RMAP_H +#define _PPC64_RMAP_H + +/* PPC calls pte_alloc() before mem_map[] is setup ... */ +#define BROKEN_PPC_PTE_ALLOC_ONE + +#include + +#endif diff -Nru a/include/asm-s390/page.h b/include/asm-s390/page.h --- a/include/asm-s390/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390/page.h Thu Apr 17 15:25:14 2003 @@ -121,6 +121,9 @@ #define PAGE_OFFSET 0x0UL #define __pa(x) (unsigned long)(x) #define __va(x) (void *)(x) +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) diff -Nru a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h --- a/include/asm-s390/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -28,7 +28,7 @@ * if any. */ -extern __inline__ pgd_t* get_pgd_slow(void) +extern __inline__ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; int i; @@ -40,42 +40,11 @@ return ret; } -extern __inline__ pgd_t* get_pgd_fast(void) -{ - unsigned long *ret = pgd_quicklist; - - if (ret != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size -= 2; - } - return (pgd_t *)ret; -} - -extern __inline__ pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd; - - pgd = get_pgd_fast(); - if (!pgd) - pgd = get_pgd_slow(); - return pgd; -} - -extern __inline__ void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long *)pgd = (unsigned long) pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size += 2; -} - -extern __inline__ void free_pgd_slow(pgd_t *pgd) +extern __inline__ void pgd_free(pgd_t *pgd) { free_pages((unsigned long) pgd, 1); } -#define pgd_free(pgd) free_pgd_fast(pgd) - /* * page middle directory allocation/free routines. * We don't use pmd cache, so these are dummy routines. 
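
The pte_offset_map()/pte_unmap() pair defined here (and in the other architecture headers touched by this patch) replaces the old pte_offset(). On ppc64, s390 and x86_64 the map is a plain pointer calculation and the unmap expands to nothing; on i386 with CONFIG_HIGHPTE it kmap_atomic()s the page table page. A usage sketch only, not code from this patch, assuming the caller holds mm->page_table_lock:

static pte_t example_read_pte(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd = pgd_offset(mm, address);
	pmd_t *pmd;
	pte_t *pte, entry = __pte(0);

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return entry;
	pmd = pmd_offset(pgd, address);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return entry;
	pte = pte_offset_map(pmd, address);	/* kmap_atomic with CONFIG_HIGHPTE */
	entry = *pte;
	pte_unmap(pte);				/* no-op without CONFIG_HIGHPTE */
	return entry;
}
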
This @@ -88,7 +57,7 @@ #define pmd_free_fast(x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() -extern inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) +extern inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { pmd_val(pmd[0]) = _PAGE_TABLE + __pa(pte); pmd_val(pmd[1]) = _PAGE_TABLE + __pa(pte+256); @@ -96,14 +65,25 @@ pmd_val(pmd[3]) = _PAGE_TABLE + __pa(pte+768); } +extern inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +{ + pte_t *p = page_address(pte); + if (p == NULL) BUG(); + pmd_val(pmd[0]) = _PAGE_TABLE + __pa(p); + pmd_val(pmd[1]) = _PAGE_TABLE + __pa(p+256); + pmd_val(pmd[2]) = _PAGE_TABLE + __pa(p+512); + pmd_val(pmd[3]) = _PAGE_TABLE + __pa(p+768); +} + /* * page table entry allocation/free routines. */ -extern inline pte_t * pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pte_t *pte; int i; + /* XXX Riel retries this 10 times if get_free_page returns NULL */ pte = (pte_t *) __get_free_page(GFP_KERNEL); if (pte != NULL) { for (i=0; i < PTRS_PER_PTE; i++) @@ -112,41 +92,49 @@ return pte; } -extern __inline__ pte_t * -pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) +extern inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long a) { - unsigned long *ret = (unsigned long *) pte_quicklist; + struct page *pte; + pte_t *p; + int i; - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size--; - } - return (pte_t *)ret; + /* XXX Riel retries this 10 times if alloc_pages returns NULL */ + pte = alloc_pages(GFP_KERNEL, 0); + if (pte != NULL) { + /* + * This is a pure cheating, using the fact that we + * are not a highmem architecture, regardles of .config + */ + p = page_address(pte); + if (p == NULL) BUG(); + for (i=0; i < PTRS_PER_PTE; i++) + pte_clear(p+i); + } + return pte; } -extern __inline__ void pte_free_fast(pte_t *pte) -{ - *(unsigned long *)pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} +#define pte_alloc_one_fast(mm, address) (0) -extern __inline__ void pte_free_slow(pte_t *pte) +extern inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long) pte); } -#define pte_free(pte) pte_free_fast(pte) +extern inline void pte_free(struct page *pte) +{ + __free_page(pte); +} extern int do_check_pgt_cache(int, int); +#if 0 /* P3 */ /* * This establishes kernel virtual mappings (e.g., as a result of a * vmalloc call). Since s390-esame uses a separate kernel page table, * there is nothing to do here... :) */ #define set_pgdir(addr,entry) do { } while(0) +#endif /* * TLB flushing: diff -Nru a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h --- a/include/asm-s390/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390/pgtable.h Thu Apr 17 15:25:14 2003 @@ -454,8 +454,8 @@ #define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT))) -#define pmd_page(pmd) \ - ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page(x) (mem_map+(unsigned long)((pmd_val(x) >> PAGE_SHIFT))) /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -471,8 +471,17 @@ } /* Find an entry in the third-level page table.. 
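
The four pmd_val() assignments above encode the 31-bit s390 page table layout: one 4K page holds 1024 four-byte ptes, wired up as four separate 256-entry tables. Spelled out as an illustration (not part of the patch):

/*
 * Illustration only: what pmd_populate() above sets up.
 *
 *	pmd[0] -> pte + 0	(bytes    0 .. 1023 of the pte page)
 *	pmd[1] -> pte + 256	(bytes 1024 .. 2047)
 *	pmd[2] -> pte + 512	(bytes 2048 .. 3071)
 *	pmd[3] -> pte + 768	(bytes 3072 .. 4095)
 *
 * This is also why pte_alloc_one() can return a single page and why
 * the page_address() shortcut is safe: as the comment in the code
 * says, s390 is not a highmem architecture, so the pte page always
 * has a kernel mapping.
 */
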
*/ -#define pte_offset(pmd, address) \ - ((pte_t *) (pmd_page(*pmd) + ((address>>10) & ((PTRS_PER_PTE-1)<<2)))) +#define __pte_offset(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) +#define pte_offset_kernel(pmd, address) \ + ((pte_t *) pmd_page_kernel(*pmd) + __pte_offset(address)) + +#define pte_offset_map(dir, address) \ + ((pte_t *) page_address(pmd_page(*(dir))) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + +typedef u32 pte_addr_t; /* * A page-table entry has some bits we have to treat in a special way. diff -Nru a/include/asm-s390/rmap.h b/include/asm-s390/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-s390/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _S390_RMAP_H +#define _S390_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-s390x/page.h b/include/asm-s390x/page.h --- a/include/asm-s390x/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390x/page.h Thu Apr 17 15:25:14 2003 @@ -95,22 +95,14 @@ unsigned long pmd0; unsigned long pmd1; } pmd_t; -typedef unsigned int pgd_t; +typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd0) #define pmd_val1(x) ((x).pmd1) - -static inline unsigned long __pgd_val(pgd_t *pgdp) -{ - unsigned long addr = (unsigned long) pgdp; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - - return *pgd_slot + ((addr & 4) << 11); -} -#define pgd_val(pgd) __pgd_val(&(pgd)) - +#define __pgd_val(x) ((x)->pgd) /* Violation in our linux/mm.h P3 */ +#define pgd_val(x) ((x).pgd) #define pgprot_val(x) ((x).pgprot) #define __pte(x) ((pte_t) { (x) } ) @@ -127,6 +119,9 @@ #define PAGE_OFFSET 0x0UL #define __pa(x) (unsigned long)(x) #define __va(x) (void *)(x) +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((page) - mem_map) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) #define VALID_PAGE(page) ((page - mem_map) < max_mapnr) diff -Nru a/include/asm-s390x/pgalloc.h b/include/asm-s390x/pgalloc.h --- a/include/asm-s390x/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390x/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -18,9 +18,6 @@ #include #include -#define pgd_quicklist (S390_lowcore.cpu_data.pgd_quick) -#define pmd_quicklist (S390_lowcore.cpu_data.pmd_quick) -#define pte_quicklist (S390_lowcore.cpu_data.pte_quick) #define pgtable_cache_size (S390_lowcore.cpu_data.pgtable_cache_sz) /* @@ -32,56 +29,28 @@ /* * page directory allocation/free routines. 
*/ -extern __inline__ pgd_t *get_pgd_slow (void) +extern inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - int i; + int i; - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, 1); + ret = (pgd_t *) __get_free_pages(GFP_KERNEL, 2); if (ret != NULL) - for (i = 0; i < PTRS_PER_PGD; i++) + for (i = 0; i < PTRS_PER_PGD; i++) pgd_clear(ret + i); return ret; } -extern __inline__ pgd_t *get_pgd_fast (void) +extern inline void pgd_free(pgd_t *pgd) { - unsigned long *ret = pgd_quicklist; - - if (ret != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size -= 2; - } - return (pgd_t *) ret; -} - -extern __inline__ pgd_t *pgd_alloc (struct mm_struct *mm) -{ - pgd_t *pgd; - - pgd = get_pgd_fast(); - if (!pgd) - pgd = get_pgd_slow(); - return pgd; + free_pages((unsigned long) pgd, 2); } -extern __inline__ void free_pgd_fast (pgd_t *pgd) +extern inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) { - *(unsigned long *) pgd = (unsigned long) pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size += 2; + pgd_val(*pgd) = _PGD_ENTRY | __pa(pmd); } -extern __inline__ void free_pgd_slow (pgd_t *pgd) -{ - free_pages((unsigned long) pgd, 1); -} - -#define pgd_free(pgd) free_pgd_fast(pgd) - -extern pmd_t *pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd); - /* * page middle directory allocation/free routines. */ @@ -90,7 +59,7 @@ pmd_t *pmd; int i; - pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 1); + pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); if (pmd != NULL) { for (i=0; i < PTRS_PER_PMD; i++) pmd_clear(pmd+i); @@ -98,51 +67,39 @@ return pmd; } -extern __inline__ pmd_t * -pmd_alloc_one_fast(struct mm_struct *mm, unsigned long address) -{ - unsigned long *ret = (unsigned long *) pmd_quicklist; - - if (ret != NULL) { - pmd_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size -= 2; - } - return (pmd_t *) ret; -} +#define pmd_alloc_one_fast(mm, address) (0) -extern void pmd_free_order2(pmd_t *); -extern __inline__ void pmd_free_fast (pmd_t *pmd) +extern inline void pmd_free(pmd_t *pmd) { - if (test_bit(PG_arch_1, &virt_to_page(pmd)->flags) == 0) { - *(unsigned long *) pmd = (unsigned long) pmd_quicklist; - pmd_quicklist = (unsigned long *) pmd; - pgtable_cache_size += 2; - } else - pmd_free_order2(pmd); + free_pages((unsigned long) pmd, 2); } -extern __inline__ void pmd_free_slow (pmd_t *pmd) +extern inline void +pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - free_pages((unsigned long) pmd, 1); + pmd_val(*pmd) = _PMD_ENTRY | __pa(pte); + pmd_val1(*pmd) = _PMD_ENTRY | __pa(pte+256); } -#define pmd_free(pmd) pmd_free_fast(pmd) - -extern inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) +extern inline void +pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { - pmd_val(*pmd) = _PMD_ENTRY | __pa(pte); - pmd_val1(*pmd) = _PMD_ENTRY | __pa(pte+256); + pte_t *p = page_address(pte); + if (p == NULL) BUG(); + pmd_val(*pmd) = _PMD_ENTRY | __pa(p); + pmd_val1(*pmd) = _PMD_ENTRY | __pa(p+256); } /* * page table entry allocation/free routines. 
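
The allocations above grow from order 1 to order 2 because the s390x table geometry changes in this patch. The arithmetic, as an illustration only (assuming the 8-byte pgd_t and 16-byte two-part pmd_t shown earlier, and the PTRS_PER_PMD change further down):

/*
 * Illustration only, not part of the patch:
 *
 *	pgd: PTRS_PER_PGD (2048) * sizeof(pgd_t) (8 bytes)  = 16K = 4 pages
 *	pmd: PTRS_PER_PMD (1024) * sizeof(pmd_t) (16 bytes) = 16K = 4 pages
 *
 * so both tables now need __get_free_pages(GFP_KERNEL, 2), matching
 * the PTRS_PER_PMD 512 -> 1024 change in asm-s390x/pgtable.h below.
 */
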
*/ -extern inline pte_t * pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +extern inline pte_t * +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pte_t *pte; int i; + /* XXX Riel retries this 10 times if get_free_page returns NULL */ pte = (pte_t *) __get_free_page(GFP_KERNEL); if (pte != NULL) { for (i=0; i < PTRS_PER_PTE; i++) @@ -151,40 +108,34 @@ return pte; } -extern __inline__ pte_t* pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) +extern inline struct page * +pte_alloc_one(struct mm_struct *mm, unsigned long addr) { - unsigned long *ret = (unsigned long *) pte_quicklist; - - if (ret != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size--; - } - return (pte_t *)ret; + return virt_to_page(pte_alloc_one_kernel(mm, addr)); } -extern __inline__ void pte_free_fast (pte_t *pte) -{ - *(unsigned long *) pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} +#define pte_alloc_one_fast(mm, address) (0) -extern __inline__ void pte_free_slow (pte_t *pte) +extern inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long) pte); } -#define pte_free(pte) pte_free_fast(pte) +extern inline void pte_free(struct page *pte) +{ + __free_page(pte); +} -extern int do_check_pgt_cache (int, int); +#define do_check_pgt_cache(x, y) (0) /* No quicklists in rmap P3 */ +#if 0 /* P3 */ /* * This establishes kernel virtual mappings (e.g., as a result of a * vmalloc call). Since s390-esame uses a separate kernel page table, * there is nothing to do here... :) */ #define set_pgdir(vmaddr, entry) do { } while(0) +#endif /* * TLB flushing: diff -Nru a/include/asm-s390x/pgtable.h b/include/asm-s390x/pgtable.h --- a/include/asm-s390x/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-s390x/pgtable.h Thu Apr 17 15:25:14 2003 @@ -63,7 +63,7 @@ #define PMD_MASK (~(PMD_SIZE-1)) /* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT 30 +#define PGDIR_SHIFT 31 #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -72,7 +72,7 @@ * currently we use a 3 level lookup */ #define PTRS_PER_PTE 512 -#define PTRS_PER_PMD 512 +#define PTRS_PER_PMD 1024 #define PTRS_PER_PGD 2048 /* @@ -169,15 +169,13 @@ /* Bits in the region third table entry */ #define _PGD_ENTRY_INV 0x20 /* region table entry invalid bit */ -#define _PGD_ENTRY_MASK 0x04 /* region third table entry mask */ -#define _PGD_ENTRY_LEN(x) ((x)&3) /* region table length bits */ -#define _PGD_ENTRY_OFF(x) (((x)&3)<<6) /* region table offset bits */ +#define _PGD_ENTRY 0x07 /* * User and kernel page directory */ #define _REGION_THIRD 0x4 -#define _REGION_THIRD_LEN 0x1 +#define _REGION_THIRD_LEN 0x3 #define _REGION_TABLE (_REGION_THIRD|_REGION_THIRD_LEN|0x40|0x100) #define _KERN_REGION_TABLE (_REGION_THIRD|_REGION_THIRD_LEN) @@ -254,37 +252,20 @@ /* * pgd/pmd/pte query functions */ -extern inline int __pgd_present(pgd_t *pgd) +extern inline int pgd_present(pgd_t pgd) { - unsigned long addr = (unsigned long) pgd; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - unsigned long offset = (addr & 4) >> 1; - - if (*pgd_slot & _PGD_ENTRY_INV) - return 0; - if ((*pgd_slot & _PGD_ENTRY_OFF(3)) > _PGD_ENTRY_OFF(offset)) - return 0; - if ((*pgd_slot & _PGD_ENTRY_LEN(3)) < _PGD_ENTRY_LEN(offset)) - return 0; - return 1; + return (pgd_val(pgd) & ~PAGE_MASK) == _PGD_ENTRY; } -#define pgd_present(pgd) __pgd_present(&(pgd)) -extern inline int __pgd_none(pgd_t *pgd) +extern 
inline int pgd_none(pgd_t pgd) { - return !__pgd_present(pgd); + return pgd_val(pgd) & _PGD_ENTRY_INV; } -#define pgd_none(pgd) __pgd_none(&(pgd)) -extern inline int __pgd_bad(pgd_t *pgd) +extern inline int pgd_bad(pgd_t pgd) { - unsigned long addr = (unsigned long) pgd; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - - return (*pgd_slot & (~PAGE_MASK & ~_PGD_ENTRY_INV & ~_PGD_ENTRY_MASK & - ~_PGD_ENTRY_LEN(3) & ~_PGD_ENTRY_OFF(3))) != 0; + return (pgd_val(pgd) & (~PAGE_MASK & ~_PGD_ENTRY_INV)) != _PGD_ENTRY; } -#define pgd_bad(pgd) __pgd_bad(&(pgd)) extern inline int pmd_present(pmd_t pmd) { @@ -346,27 +327,7 @@ */ extern inline void pgd_clear(pgd_t * pgdp) { - unsigned long addr = (unsigned long) pgdp; - unsigned long *pgd_slot = (unsigned long *) (addr & -8); - unsigned long offset = addr & 4; - - if (*pgd_slot & _PGD_ENTRY_INV) { - *pgd_slot = _PGD_ENTRY_INV; - return; - } - if (offset == 0 && (*pgd_slot & _PGD_ENTRY_LEN(2)) != 0) { - /* Clear lower pmd, upper pmd still used. */ - *pgd_slot = (*pgd_slot & PAGE_MASK) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(2) | _PGD_ENTRY_LEN(3); - return; - } - if (offset == 4 && (*pgd_slot & _PGD_ENTRY_OFF(2)) == 0) { - /* Clear upped pmd, lower pmd still used. */ - *pgd_slot = (*pgd_slot & PAGE_MASK) | _PGD_ENTRY_MASK | - _PGD_ENTRY_OFF(0) | _PGD_ENTRY_LEN(1); - return; - } - *pgd_slot = _PGD_ENTRY_INV; + pgd_val(*pgdp) = _PGD_ENTRY_INV | _PGD_ENTRY; } extern inline void pmd_clear(pmd_t * pmdp) @@ -512,8 +473,9 @@ #define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT))) -#define pmd_page(pmd) \ +#define pmd_page_kernel(pmd) \ ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page(x) (mem_map+(unsigned long)((pmd_val(x) >> PAGE_SHIFT))) /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -530,8 +492,17 @@ ((pmd_t *) pgd_page(dir) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. */ -#define pte_offset(dir,addr) \ - ((pte_t *) pmd_page(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define __pte_offset(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) +#define pte_offset_kernel(pmd, address) \ + ((pte_t *) pmd_page_kernel(*pmd) + __pte_offset(address)) + +#define pte_offset_map(dir, address) \ + ((pte_t *) page_address(pmd_page(*(dir))) + __pte_offset(address)) +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + +typedef u64 pte_addr_t; /* * A page-table entry has some bits we have to treat in a special way. 
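
Raising PGDIR_SHIFT to 31 and PTRS_PER_PMD to 1024 in the hunk above changes how much address space each table level covers. Worked out as an illustration only (assuming 4K pages and 8-byte pte entries):

/*
 * Illustration only, not part of the patch:
 *
 *	pte table:  512 entries * 4K = 2M mapped per pmd entry
 *	pmd table: 1024 entries * 2M = 2G mapped per pgd entry  (PGDIR_SHIFT 31)
 *	pgd table: 2048 entries * 2G = 4T total, i.e. a 42-bit address space
 */
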
diff -Nru a/include/asm-s390x/rmap.h b/include/asm-s390x/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-s390x/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _S390X_RMAP_H +#define _S390X_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sh/rmap.h b/include/asm-sh/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sh/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _SH_RMAP_H +#define _SH_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sparc/rmap.h b/include/asm-sparc/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sparc/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _SPARC_RMAP_H +#define _SPARC_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sparc64/rmap.h b/include/asm-sparc64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sparc64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _SPARC64_RMAP_H +#define _SPARC64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h --- a/include/asm-x86_64/io.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/io.h Thu Apr 17 15:25:14 2003 @@ -137,15 +137,6 @@ return __va(address); } -/* - * Change "struct page" to physical address. - */ -#ifdef CONFIG_DISCONTIGMEM -#include -#else -#define page_to_phys(page) (((page) - mem_map) << PAGE_SHIFT) -#endif - extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); extern inline void * ioremap (unsigned long offset, unsigned long size) diff -Nru a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h --- a/include/asm-x86_64/page.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/page.h Thu Apr 17 15:25:14 2003 @@ -114,31 +114,31 @@ /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. Otherwise you risk miscompilation. */ #define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) -/* __pa_symbol should use for C visible symbols, but only for them. +/* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. 
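
As a usage sketch only (not part of the patch), taking the physical address of a C-visible kernel symbol goes through __pa_symbol(); the empty asm presumably just forces the address through a register so gcc cannot fold or miscompile the arithmetic, as the note above warns. _text is only an example linker symbol here:

extern char _text[];	/* example symbol, for illustration only */

static inline unsigned long example_phys_of_text(void)
{
	return __pa_symbol(&_text);	/* not __pa(&_text) */
}
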
*/ #define __pa_symbol(x) \ ({unsigned long v; \ asm("" : "=r" (v) : "0" (x)); \ - v - __START_KERNEL_map; }) -#define __pa_maybe_symbol(x) \ - ({unsigned long v; \ - asm("" : "=r" (v) : "0" (x)); \ __pa(v); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #ifndef CONFIG_DISCONTIGMEM -#define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((page) - mem_map) #define page_to_phys(page) (((page) - mem_map) << PAGE_SHIFT) +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #define VALID_PAGE(page) (((page) - mem_map) < max_mapnr) +#define pfn_to_phys(pfn) ((unsigned long)(pfn) << PAGE_SHIFT) +#else +#include #endif -#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) - +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #endif /* __KERNEL__ */ diff -Nru a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h --- a/include/asm-x86_64/pgalloc.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/pgalloc.h Thu Apr 17 15:25:14 2003 @@ -9,137 +9,79 @@ #include #include -#define inc_pgcache_size() add_pda(pgtable_cache_sz,1UL) -#define dec_pgcache_size() sub_pda(pgtable_cache_sz,1UL) - -#define pmd_populate(mm, pmd, pte) \ +#define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) #define pgd_populate(mm, pgd, pmd) \ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pmd))) -extern __inline__ pmd_t *get_pmd_slow(void) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL); + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); } -extern __inline__ pmd_t *get_pmd_fast(void) +extern __inline__ pmd_t *get_pmd(void) { - unsigned long *ret; - - if ((ret = read_pda(pmd_quick)) != NULL) { - write_pda(pmd_quick, (unsigned long *)(*ret)); - ret[0] = 0; - dec_pgcache_size(); - } else - ret = (unsigned long *)get_pmd_slow(); - return (pmd_t *)ret; + return (pmd_t *)get_zeroed_page(GFP_KERNEL); } extern __inline__ void pmd_free(pmd_t *pmd) { - *(unsigned long *)pmd = (unsigned long) read_pda(pmd_quick); - write_pda(pmd_quick,(unsigned long *) pmd); - inc_pgcache_size(); -} - -extern __inline__ void pmd_free_slow(pmd_t *pmd) -{ if ((unsigned long)pmd & (PAGE_SIZE-1)) - out_of_line_bug(); + BUG(); free_page((unsigned long)pmd); } -static inline pmd_t *pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr) -{ - unsigned long *ret = (unsigned long *)read_pda(pmd_quick); - - if (ret != NULL) { - write_pda(pmd_quick, (unsigned long *)(*ret)); - ret[0] = 0; - dec_pgcache_size(); - } - return (pmd_t *)ret; -} - static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL); -} - -static inline pgd_t *pgd_alloc_one_fast (void) -{ - unsigned long *ret = read_pda(pgd_quick); - - if (ret) { - write_pda(pgd_quick,(unsigned long *)(*ret)); - ret[0] = 0; - dec_pgcache_size(); - } - return (pgd_t *) ret; + return (pmd_t *) get_zeroed_page(GFP_KERNEL); } static inline pgd_t *pgd_alloc (struct mm_struct *mm) { - 
/* the VM system never calls pgd_alloc_one_fast(), so we do it here. */ - pgd_t *pgd = pgd_alloc_one_fast(); - - if (pgd == NULL) - pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); - return pgd; + return (pgd_t *)get_zeroed_page(GFP_KERNEL); } static inline void pgd_free (pgd_t *pgd) { - *(unsigned long *)pgd = (unsigned long) read_pda(pgd_quick); - write_pda(pgd_quick,(unsigned long *) pgd); - inc_pgcache_size(); -} - - -static inline void pgd_free_slow (pgd_t *pgd) -{ if ((unsigned long)pgd & (PAGE_SIZE-1)) - out_of_line_bug(); + BUG(); free_page((unsigned long)pgd); } - -static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)get_zeroed_page(GFP_KERNEL); + return (pte_t *) get_zeroed_page(GFP_KERNEL); } -extern __inline__ pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) -{ - unsigned long *ret; +#define pte_alloc_one_fast(x,y) (0) +#define pmd_alloc_one_fast(x,y) (0) +#define do_check_pgt_cache(x,y) (0) - if ((ret = read_pda(pte_quick)) != NULL) { - write_pda(pte_quick, (unsigned long *)(*ret)); - ret[0] = ret[1]; - dec_pgcache_size(); - } - return (pte_t *)ret; +static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + void *p = (void *)get_zeroed_page(GFP_KERNEL); + if (!p) + return NULL; + return virt_to_page(p); } -/* Should really implement gc for free page table pages. This could be done with - a reference count in struct page. */ +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. */ -extern __inline__ void pte_free(pte_t *pte) -{ - *(unsigned long *)pte = (unsigned long) read_pda(pte_quick); - write_pda(pte_quick, (unsigned long *) pte); - inc_pgcache_size(); -} - -extern __inline__ void pte_free_slow(pte_t *pte) +extern __inline__ void pte_free_kernel(pte_t *pte) { if ((unsigned long)pte & (PAGE_SIZE-1)) - out_of_line_bug(); + BUG(); free_page((unsigned long)pte); } +extern inline void pte_free(struct page *pte) +{ + __free_page(pte); +} -extern int do_check_pgt_cache(int, int); +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +#define __pmd_free_tlb(tlb,x) pmd_free(x) /* * TLB flushing: diff -Nru a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h --- a/include/asm-x86_64/pgtable.h Thu Apr 17 15:25:14 2003 +++ b/include/asm-x86_64/pgtable.h Thu Apr 17 15:25:14 2003 @@ -18,6 +18,7 @@ #include #include #include +#include extern pgd_t level3_kernel_pgt[512]; extern pgd_t level3_physmem_pgt[512]; @@ -373,7 +374,7 @@ } #define page_pte(page) page_pte_prot(page, __pgprot(0)) -#define __pmd_page(pmd) (__va(pmd_val(pmd) & PHYSICAL_PAGE_MASK)) +#define pmd_page_kernel(pmd) (__va(pmd_val(pmd) & PHYSICAL_PAGE_MASK)) /* to find an entry in a page-table-directory. */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -396,9 +397,17 @@ /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset(dir, address) ((pte_t *) __pmd_page(*(dir)) + \ +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \ __pte_offset(address)) +/* x86-64 always has all page tables mapped. 
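
Both flavours appear in this hunk: pmd_populate_kernel() takes a kernel-virtual pte pointer and uses __pa(), while pmd_populate() takes a struct page and shifts its pfn. For a page-aligned, directly mapped pte page the two agree; the struct-page form is what lets i386 keep pte pages in highmem under CONFIG_HIGHPTE. A check, as an illustration only:

static inline void example_check_pte_page(pte_t *p)
{
	/* illustration only: p must be page aligned and directly mapped */
	BUG_ON(__pa(p) != (page_to_pfn(virt_to_page(p)) << PAGE_SHIFT));
}
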
*/ +#define pte_offset_map(dir, address) pte_offset_kernel(dir,address) +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address) +#define pte_unmap(pte) /* NOP */ +#define pte_unmap_nested(pte) /* NOP */ + +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) + /* never use these in the common code */ #define pml4_page(level4) ((unsigned long) __va(pml4_val(level4) & PHYSICAL_PAGE_MASK)) #define pml4_index(address) (((address) >> PML4_SHIFT) & (PTRS_PER_PML4-1)) @@ -419,6 +428,8 @@ #define SWP_ENTRY(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) #define pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define swp_entry_to_pte(x) ((pte_t) { (x).val }) + +typedef pte_t *pte_addr_t; struct page; /* diff -Nru a/include/asm-x86_64/rmap.h b/include/asm-x86_64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-x86_64/rmap.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,7 @@ +#ifndef _X64_64_RMAP_H +#define _X86_64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/linux/brlock.h b/include/linux/brlock.h --- a/include/linux/brlock.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/brlock.h Thu Apr 17 15:25:14 2003 @@ -34,6 +34,7 @@ enum brlock_indices { BR_GLOBALIRQ_LOCK, BR_NETPROTO_LOCK, + BR_LRU_LOCK, __BR_END }; diff -Nru a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/fs.h Thu Apr 17 15:25:14 2003 @@ -268,6 +268,7 @@ wait_queue_head_t b_wait; struct list_head b_inode_buffers; /* doubly linked list of inode dirty buffers */ + struct list_head lru; /* Reclaim used buffers easily. */ }; typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); @@ -1374,6 +1375,8 @@ extern void inode_init_once(struct inode *); extern void iput(struct inode *); +extern void refile_inode(struct inode *inode); + extern void force_delete(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); @@ -1436,6 +1439,7 @@ /* Generic buffer handling for block filesystems.. */ extern int try_to_release_page(struct page * page, int gfp_mask); +extern int try_to_reclaim_buffers(int, unsigned int); extern int discard_bh_page(struct page *, unsigned long, int); #define block_flushpage(page, offset) discard_bh_page(page, offset, 1) #define block_invalidate_page(page) discard_bh_page(page, 0, 0) diff -Nru a/include/linux/highmem.h b/include/linux/highmem.h --- a/include/linux/highmem.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/highmem.h Thu Apr 17 15:25:14 2003 @@ -71,6 +71,7 @@ #define kmap_atomic(page,idx) kmap(page) #define kunmap_atomic(page,idx) kunmap(page) +#define kmap_atomic_to_page(ptr) virt_to_page(ptr) #define bh_kmap(bh) ((bh)->b_data) #define bh_kunmap(bh) do { } while (0) diff -Nru a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/mm.h Thu Apr 17 15:25:14 2003 @@ -1,5 +1,23 @@ #ifndef _LINUX_MM_H #define _LINUX_MM_H +/* + * Copyright (c) 2002. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Authors: + * Linus Torvalds + * Stephen Tweedie + * Andrea Arcangeli + * Rik van Riel + * Arjan van de Ven + * and others + */ #include #include @@ -18,9 +36,6 @@ extern unsigned long num_mappedpages; extern void * high_memory; extern int page_cluster; -/* The inactive_clean lists are per zone. */ -extern struct list_head active_list; -extern struct list_head inactive_list; #include #include @@ -134,6 +149,9 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -151,7 +169,11 @@ */ typedef struct page { struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ + struct address_space *mapping; /* The inode (or ...) we belong to. + * protected by PG_locked and the + * pagecache_lock. Hold one to read, + * both to write. + */ unsigned long index; /* Our offset within mapping. */ struct page *next_hash; /* Next page sharing our hash bucket in the pagecache hash table. */ @@ -159,7 +181,13 @@ unsigned long flags; /* atomic flags, some possibly updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; - protected by pagemap_lru_lock !! */ + protected by the lru lock !! */ + union { + struct pte_chain *chain;/* Reverse pte mapping pointer. + * protected by PG_chainlock */ + pte_addr_t direct; + } pte; + unsigned char age; /* Page aging counter. */ struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ @@ -266,7 +294,7 @@ * * Note that the referenced bit, the page->lru list_head and the * active, inactive_dirty and inactive_clean lists are protected by - * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit! + * the lru lock, and *NOT* by the usual PG_locked bit! * * PG_skip is used on sparc/sparc64 architectures to "skip" certain * parts of the address space. @@ -287,17 +315,22 @@ #define PG_referenced 2 #define PG_uptodate 3 #define PG_dirty 4 -#define PG_unused 5 -#define PG_lru 6 -#define PG_active 7 -#define PG_slab 8 -#define PG_skip 10 -#define PG_highmem 11 -#define PG_checked 12 /* kill me in 2.5.. */ -#define PG_arch_1 13 -#define PG_reserved 14 -#define PG_launder 15 /* written out by VM pressure.. */ -#define PG_fs_1 16 /* Filesystem specific */ +#define PG_active_anon 5 +#define PG_direct 6 +#define PG_inactive_dirty 7 +#define PG_inactive_laundry 8 +#define PG_inactive_clean 9 +#define PG_slab 10 +#define PG_skip 11 +#define PG_highmem 12 +#define PG_checked 13 /* kill me in 2.5.. */ +#define PG_arch_1 14 +#define PG_reserved 15 +#define PG_launder 16 /* written out by VM pressure.. */ +#define PG_chainlock 17 /* lock bit for ->pte_chain */ +#define PG_lru 18 +#define PG_active_cache 19 +#define PG_fs_1 20 /* Make it prettier to test the above... */ #define UnlockPage(page) unlock_page(page) @@ -317,6 +350,49 @@ #define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) /* + * inlines for acquisition and release of PG_chainlock + */ +static inline void pte_chain_lock(struct page *page) +{ + /* + * The preempt patch seems to be popular enough to + * warrant this little hack... 
+ */ +#ifdef CONFIG_PREEMPT + preempt_disable(); +#endif + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. + */ +#ifdef CONFIG_SMP + while (test_and_set_bit(PG_chainlock, &page->flags)) { + while (test_bit(PG_chainlock, &page->flags)) { + barrier(); + cpu_relax(); + } + } +#endif +} + +static inline void pte_chain_unlock(struct page *page) +{ +#ifdef CONFIG_SMP + clear_bit(PG_chainlock, &page->flags); +#endif + /* + * The preempt patch seems to be popular enough to + * warrant this little hack... + */ +#ifdef CONFIG_PREEMPT + preempt_enable(); +#endif +} + +/* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ @@ -378,6 +454,9 @@ * the clear_bit and the read of the waitqueue (to avoid SMP races with a * parallel wait_on_page). */ +#define PageDirect(page) test_bit(PG_direct, &(page)->flags) +#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) +#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) #define ClearPageError(page) clear_bit(PG_error, &(page)->flags) @@ -390,13 +469,34 @@ #define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) #define PageReserved(page) test_bit(PG_reserved, &(page)->flags) -#define PageActive(page) test_bit(PG_active, &(page)->flags) -#define SetPageActive(page) set_bit(PG_active, &(page)->flags) -#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) - -#define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define PageActiveAnon(page) test_bit(PG_active_anon, &(page)->flags) +#define SetPageActiveAnon(page) set_bit(PG_active_anon, &(page)->flags) +#define ClearPageActiveAnon(page) clear_bit(PG_active_anon, &(page)->flags) +#define TestandSetPageActiveAnon(page) test_and_set_bit(PG_active_anon, &(page)->flags) +#define TestandClearPageActiveAnon(page) test_and_clear_bit(PG_active_anon, &(page)->flags) + +#define PageActiveCache(page) test_bit(PG_active_cache, &(page)->flags) +#define SetPageActiveCache(page) set_bit(PG_active_cache, &(page)->flags) +#define ClearPageActiveCache(page) clear_bit(PG_active_cache, &(page)->flags) +#define TestandSetPageActiveCache(page) test_and_set_bit(PG_active_cache, &(page)->flags) +#define TestandClearPageActiveCache(page) test_and_clear_bit(PG_active_cache, &(page)->flags) + +#define PageInactiveLaundry(page) test_bit(PG_inactive_laundry, &(page)->flags) +#define SetPageInactiveLaundry(page) set_bit(PG_inactive_laundry, &(page)->flags) +#define ClearPageInactiveLaundry(page) clear_bit(PG_inactive_laundry, &(page)->flags) + +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) + +#define PageLRU(page) test_bit(PG_lru, 
&(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) +#define TestandSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -408,6 +508,16 @@ #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) /* + * Return true if this page is mapped into pagetables. Subtle: test pte.direct + * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain + * is only 32-bit. + */ +static inline int page_mapped(struct page *page) +{ + return page->pte.direct != 0; +} + +/* * Error return values for the *_nopage functions */ #define NOPAGE_SIGBUS (NULL) @@ -461,6 +571,7 @@ #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) +extern void FASTCALL(fixup_freespace(struct zone_struct *, int)); extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); @@ -479,7 +590,8 @@ extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); diff -Nru a/include/linux/mm_inline.h b/include/linux/mm_inline.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/mm_inline.h Thu Apr 17 15:25:14 2003 @@ -0,0 +1,424 @@ +#ifndef _LINUX_MM_INLINE_H +#define _LINUX_MM_INLINE_H + +#include +#include +#include + + +/* + * Copyright (c) 2002. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: + * Linus Torvalds + * Stephen Tweedie + * Andrea Arcangeli + * Rik van Riel + * Arjan van de Ven + * and others + */ + +GPL_HEADER() + +/* + * These inline functions tend to need bits and pieces of all the + * other VM include files, meaning they cannot be defined inside + * one of the other VM include files. + * + */ + +/** + * page_dirty - do we need to write the data out to disk + * @page: page to test + * + * Returns true if the page contains data which needs to + * be written to disk. Doesn't test the page tables (yet?). + */ +static inline int page_dirty(struct page *page) +{ + struct buffer_head *tmp, *bh; + + if (PageDirty(page)) + return 1; + + if (page->mapping && !page->buffers) + return 0; + + tmp = bh = page->buffers; + + do { + if (tmp->b_state & ((1<b_this_page; + } while (tmp != bh); + + return 0; +} + +/** + * page_anon - is this page ram/swap backed ? + * @page - page to test + * + * Returns 1 if the page is backed by ram/swap, 0 if the page is + * backed by a file in a filesystem on permanent storage. 
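
The pte union earlier in this hunk works together with PG_direct and PG_chainlock: a page mapped by a single pte keeps that pte's pte_addr_t in page->pte.direct, while a page with more mappings points page->pte.chain at a pte_chain, a structure private to mm/rmap.c. A small reader, as an illustration only (not code from this patch), using the helpers defined above:

static inline int example_page_uses_pte_chain(struct page *page)
{
	int chained;

	pte_chain_lock(page);
	chained = page_mapped(page) && !PageDirect(page);
	pte_chain_unlock(page);
	return chained;
}
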
+ */ +static inline int page_anon(struct page * page) +{ + /* Pages of an mmap()d file won't trigger this unless they get + * referenced on the inactive list and really are in the working + * set of the process... */ + if (page->pte.direct) + return 1; + + if (!page->mapping && !page->buffers) + return 1; + + if (PageSwapCache(page)) + return 1; + + /* TODO: ramfs, tmpfs shm segments and ramdisk */ + + return 0; +} + + + +static inline void add_page_to_active_anon_list(struct page * page, int age) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageActiveAnon(page); + list_add(&page->lru, &zone->active_anon_list[age]); + page->age = age + zone->anon_age_bias; + zone->active_anon_count[age]++; + zone->active_anon_pages++; +} + +static inline void add_page_to_active_cache_list(struct page * page, int age) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageActiveCache(page); + list_add(&page->lru, &zone->active_cache_list[age]); + page->age = age + zone->cache_age_bias; + zone->active_cache_count[age]++; + zone->active_cache_pages++; +} + +static inline void add_page_to_active_list(struct page * page, int age) +{ + if (page_anon(page)) + add_page_to_active_anon_list(page, age); + else + add_page_to_active_cache_list(page, age); +} + +static inline void add_page_to_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveDirty(page); + list_add(&page->lru, &zone->inactive_dirty_list); + zone->inactive_dirty_pages++; +} + +static inline void add_page_to_inactive_laundry_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveLaundry(page); + list_add(&page->lru, &zone->inactive_laundry_list); + zone->inactive_laundry_pages++; +} + +static inline void add_page_to_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveClean(page); + list_add(&page->lru, &zone->inactive_clean_list); + zone->inactive_clean_pages++; +} + +static inline void del_page_from_active_anon_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + unsigned char age; + list_del(&page->lru); + ClearPageActiveAnon(page); + zone->active_anon_pages--; + age = page->age - zone->anon_age_bias; + if (age<=MAX_AGE) + zone->active_anon_count[age]--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_active_cache_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + unsigned char age; + list_del(&page->lru); + ClearPageActiveCache(page); + zone->active_cache_pages--; + age = page->age - zone->cache_age_bias; + if (age<=MAX_AGE) + zone->active_cache_count[age]--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveDirty(page); + zone->inactive_dirty_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_laundry_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveLaundry(page); + zone->inactive_laundry_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveClean(page); + zone->inactive_clean_pages--; + DEBUG_LRU_PAGE(page); +} + +/* + * 
Inline functions to control some balancing in the VM. + * + * Note that we do both global and per-zone balancing, with + * most of the balancing done globally. + */ +#define PLENTY_FACTOR 2 +#define ALL_ZONES NULL +#define ANY_ZONE (struct zone_struct *)(~0UL) +#define INACTIVE_FACTOR 5 + +#define VM_MIN 0 +#define VM_LOW 1 +#define VM_HIGH 2 +#define VM_PLENTY 3 +static inline int zone_free_limit(struct zone_struct * zone, int limit) +{ + int free, target, delta; + + /* This is really nasty, but GCC should completely optimise it away. */ + if (limit == VM_MIN) + target = zone->pages_min; + else if (limit == VM_LOW) + target = zone->pages_low; + else if (limit == VM_HIGH) + target = zone->pages_high; + else + target = zone->pages_high * PLENTY_FACTOR; + + free = zone->free_pages + zone->inactive_clean_pages; + delta = target - free; + + return delta; +} + +static inline int free_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_free_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_free_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_free_limit(zone, limit); + } + + return shortage; +} + +/** + * free_min - test for critically low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a serious shortage of free and + * clean pages, zero or negative if there is no serious shortage. + */ +static inline int free_min(struct zone_struct * zone) +{ + return free_limit(zone, VM_MIN); +} + +/** + * free_low - test for low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a shortage of free and + * clean pages, zero or negative if there is no shortage. + */ +static inline int free_low(struct zone_struct * zone) +{ + return free_limit(zone, VM_LOW); +} + +/** + * free_high - test if amount of free pages is less than ideal + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free and clean + * pages is below kswapd's target, zero or negative if we + * have more than enough free and clean pages. + */ +static inline int free_high(struct zone_struct * zone) +{ + return free_limit(zone, VM_HIGH); +} + +/** + * free_plenty - test if enough pages are freed + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free + clean pages + * in a zone is not yet excessive and kswapd is still allowed to + * free pages here, a negative value if kswapd should leave the + * zone alone. + */ +static inline int free_plenty(struct zone_struct * zone) +{ + return free_limit(zone, VM_PLENTY); +} + +/* + * The inactive page target is the free target + 20% of (active + inactive) + * pages. + */ +static inline int zone_inactive_limit(struct zone_struct * zone, int limit) +{ + int inactive, target, inactive_base; + + inactive_base = zone->active_anon_pages + zone->active_cache_pages; + inactive_base /= INACTIVE_FACTOR; + + /* GCC should optimise this away completely. 
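
To make the shortage arithmetic above concrete, here is a stand-alone model of zone_free_limit()/free_limit(): a positive return value means the zone (or the system) is that many pages short of the chosen target, a negative value means surplus. The struct below is a minimal stand-in, not the kernel's zone_t:

struct zone_model {
        long free_pages, inactive_clean_pages;
        long pages_min, pages_low, pages_high;
};

enum vm_limit { VM_MIN, VM_LOW, VM_HIGH, VM_PLENTY };
#define PLENTY_FACTOR 2

static long zone_free_shortage(const struct zone_model *z, enum vm_limit limit)
{
        long target;

        switch (limit) {
        case VM_MIN:    target = z->pages_min;  break;
        case VM_LOW:    target = z->pages_low;  break;
        case VM_HIGH:   target = z->pages_high; break;
        default:        target = z->pages_high * PLENTY_FACTOR; break;
        }
        return target - (z->free_pages + z->inactive_clean_pages);
}

/* ANY_ZONE behaviour: clamp per-zone surpluses to zero so one roomy zone
 * cannot hide another zone's shortage (ALL_ZONES just sums raw deltas). */
static long any_zone_shortage(const struct zone_model *zones, int nr,
                              enum vm_limit limit)
{
        long shortage = 0;

        for (int i = 0; i < nr; i++) {
                long local = zone_free_shortage(&zones[i], limit);
                if (local > 0)
                        shortage += local;
        }
        return shortage;
}

free_min(), free_low(), free_high() and free_plenty() above are this computation with the four targets plugged in.
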
*/ + if (limit == VM_MIN) + target = zone->pages_high + inactive_base / 2; + else if (limit == VM_LOW) + target = zone->pages_high + inactive_base; + else + target = zone->pages_high + inactive_base * 2; + + inactive = zone->free_pages + zone->inactive_clean_pages + + zone->inactive_dirty_pages + zone->inactive_laundry_pages; + + return target - inactive; +} + +static inline int inactive_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_inactive_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_inactive_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_inactive_limit(zone, limit); + } + + return shortage; +} + +/** + * inactive_min - test for serious shortage of (free + inactive clean) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no serious shortage of (free + inactive clean) pages + */ +static inline int inactive_min(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_MIN); +} + +/** + * inactive_low - test for shortage of (free + inactive clean) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no shortage of (free + inactive clean) pages + */ +static inline int inactive_low(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_LOW); +} + +/** + * inactive_high - less than ideal amount of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have more than enough (free + inactive) pages + */ +static inline int inactive_high(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_HIGH); +} + +/* + * inactive_target - number of inactive pages we ought to have. + */ +static inline int inactive_target(void) +{ + int target; + + target = nr_active_anon_pages() + nr_active_cache_pages() + + nr_inactive_dirty_pages() + nr_inactive_clean_pages() + + nr_inactive_laundry_pages(); + + target /= INACTIVE_FACTOR; + + return target; +} + +static inline void lru_lock(struct zone_struct *zone) +{ + if (zone) { + br_read_lock(BR_LRU_LOCK); + spin_lock(&zone->lru_lock); + } else { + br_write_lock(BR_LRU_LOCK); + } +} + +static inline void lru_unlock(struct zone_struct *zone) +{ + if (zone) { + spin_unlock(&zone->lru_lock); + br_read_unlock(BR_LRU_LOCK); + } else { + br_write_unlock(BR_LRU_LOCK); + } +} + +#endif /* _LINUX_MM_INLINE_H */ diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/mmzone.h Thu Apr 17 15:25:14 2003 @@ -13,11 +13,7 @@ * Free memory management - zoned buddy allocator. 
*/ -#ifndef CONFIG_FORCE_MAX_ZONEORDER #define MAX_ORDER 10 -#else -#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER -#endif typedef struct free_area_struct { struct list_head free_list; @@ -25,6 +21,16 @@ } free_area_t; struct pglist_data; +struct pte_chain; + +#define MAX_AGE 15 +#define INITIAL_AGE 3 + +#define MAX_PER_CPU_PAGES 512 +typedef struct per_cpu_pages_s { + int nr_pages, max_nr_pages; + struct list_head head; +} __attribute__((aligned(L1_CACHE_BYTES))) per_cpu_t; /* * On machines where it is needed (eg PCs) we divide physical memory @@ -38,15 +44,32 @@ /* * Commonly accessed fields: */ + per_cpu_t cpu_pages[NR_CPUS]; spinlock_t lock; unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; + unsigned long active_anon_pages; + unsigned long active_cache_pages; + unsigned long inactive_dirty_pages; + unsigned long inactive_laundry_pages; + unsigned long inactive_clean_pages; + unsigned long pages_min, pages_low, pages_high, pages_plenty; int need_balance; + int need_scan; + int active_anon_count[MAX_AGE+1]; + int active_cache_count[MAX_AGE+1]; + unsigned char anon_age_bias, cache_age_bias; + unsigned long age_next, age_interval; /* * free areas of different sizes */ + struct list_head active_anon_list[MAX_AGE+1]; + struct list_head active_cache_list[MAX_AGE+1]; + struct list_head inactive_dirty_list; + struct list_head inactive_laundry_list; + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; + spinlock_t lru_lock; /* * wait_table -- the array holding the hash table @@ -142,9 +165,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; - -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) /* * The following two are not meant for general usage. They are here as diff -Nru a/include/linux/module.h b/include/linux/module.h --- a/include/linux/module.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/module.h Thu Apr 17 15:25:14 2003 @@ -287,6 +287,9 @@ static const char __module_license[] __attribute__((section(".modinfo"))) = \ "license=" license +#define GPL_HEADER() \ +static const char cpyright="This software may be freely redistributed under the terms of the GNU General Public License."; + /* Define the module variable, and usage macros. */ extern struct module __this_module; @@ -302,7 +305,6 @@ static const char __module_using_checksums[] __attribute__((section(".modinfo"))) = "using_checksums=1"; #endif - #else /* MODULE */ #define MODULE_AUTHOR(name) @@ -311,6 +313,7 @@ #define MODULE_SUPPORTED_DEVICE(name) #define MODULE_PARM(var,type) #define MODULE_PARM_DESC(var,desc) +#define GPL_HEADER() /* Create a dummy reference to the table to suppress gcc unused warnings. 
Put * the reference in the .data.exit section which is discarded when code is built diff -Nru a/include/linux/pagemap.h b/include/linux/pagemap.h --- a/include/linux/pagemap.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/pagemap.h Thu Apr 17 15:25:14 2003 @@ -90,6 +90,7 @@ extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); extern void ___wait_on_page(struct page *); +extern int wait_on_page_timeout(struct page *page, int timeout); static inline void wait_on_page(struct page * page) { diff -Nru a/include/linux/sched.h b/include/linux/sched.h --- a/include/linux/sched.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/sched.h Thu Apr 17 15:25:14 2003 @@ -235,7 +235,7 @@ unsigned long rss, total_vm, locked_vm; unsigned long def_flags; unsigned long cpu_vm_mask; - unsigned long swap_address; + unsigned long rlimit_rss; unsigned dumpable:1; @@ -254,6 +254,7 @@ mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ page_table_lock: SPIN_LOCK_UNLOCKED, \ mmlist: LIST_HEAD_INIT(name.mmlist), \ + rlimit_rss: RLIM_INFINITY, \ } struct signal_struct { @@ -335,8 +336,6 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; diff -Nru a/include/linux/swap.h b/include/linux/swap.h --- a/include/linux/swap.h Thu Apr 17 15:25:14 2003 +++ b/include/linux/swap.h Thu Apr 17 15:25:14 2003 @@ -85,8 +85,11 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_buffer_pages(void); -extern int nr_active_pages; -extern int nr_inactive_pages; +extern unsigned int nr_active_anon_pages(void); +extern unsigned int nr_active_cache_pages(void); +extern unsigned int nr_inactive_dirty_pages(void); +extern unsigned int nr_inactive_laundry_pages(void); +extern unsigned int nr_inactive_clean_pages(void); extern atomic_t page_cache_size; extern atomic_t buffermem_pages; @@ -102,19 +105,62 @@ struct zone_t; +/* linux/mm/rmap.c */ +struct pte_chain; +extern int FASTCALL(page_referenced(struct page *, int *)); +extern struct pte_chain * FASTCALL(page_add_rmap(struct page *, pte_t *, + struct pte_chain *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern int FASTCALL(try_to_unmap(struct page *)); +struct pte_chain * pte_chain_alloc(int); +void __pte_chain_free(struct pte_chain *); + +static inline void pte_chain_free(struct pte_chain * pte_chain) +{ + if (pte_chain) + __pte_chain_free(pte_chain); +} + +/* return values of try_to_unmap */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 +#define SWAP_ERROR 3 + /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); +extern void FASTCALL(lru_cache_add_dirty(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); +extern void FASTCALL(drop_page(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages_zone(zone_t *, unsigned int)); -extern int FASTCALL(try_to_free_pages(unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern int 
rebalance_laundry_zone(struct zone_struct *, int, unsigned int); +extern void wakeup_kswapd(unsigned int); +extern void rss_free_pages(unsigned int); + +/* + * Limits, in percent, on how large the cache can be and how to do + * page reclaiming. If the cache is more than borrow% in size, we + * reclaim pages from the cache and won't swap out application pages. + * Check mm/vmscan.c for implementation details. + */ +struct cache_limits { + int min; + int borrow; + int max; +}; +extern struct cache_limits cache_limits; /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -128,6 +174,7 @@ extern void show_swap_cache_info(void); #endif extern int add_to_swap_cache(struct page *, swp_entry_t); +extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *page); @@ -158,49 +205,34 @@ asmlinkage long sys_swapoff(const char *); asmlinkage long sys_swapon(const char *, int); -extern spinlock_cacheline_t pagemap_lru_lock_cacheline; -#define pagemap_lru_lock pagemap_lru_lock_cacheline.lock extern void FASTCALL(mark_page_accessed(struct page *)); /* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 + +/* * List add/del helper macros. These must be called - * with the pagemap_lru_lock held! + * with the lru lock held! */ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ + if (PageActiveAnon(page)) \ BUG(); \ - if (PageActive(page)) \ + if (PageActiveCache(page)) \ + BUG(); \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveLaundry(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ BUG(); \ -} while (0) - -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ -} while (0) - -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - nr_inactive_pages--; \ } while (0) extern spinlock_t swaplock; diff -Nru a/init/main.c b/init/main.c --- a/init/main.c Thu Apr 17 15:25:14 2003 +++ b/init/main.c Thu Apr 17 15:25:14 2003 @@ -94,6 +94,7 @@ extern void sysctl_init(void); extern void signals_init(void); extern int init_pcmcia_ds(void); +extern void pte_chain_init(void); extern void free_initmem(void); @@ -397,6 +398,7 @@ mem_init(); kmem_cache_sizes_init(); pgtable_cache_init(); + pte_chain_init(); /* * For architectures that have highmem, num_mappedpages represents diff -Nru a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c Thu Apr 17 15:25:14 2003 +++ b/kernel/fork.c Thu Apr 17 15:25:14 2003 @@ -152,7 +152,6 @@ mm->map_count = 0; mm->rss = 0; mm->cpu_vm_mask = 0; - mm->swap_address = 0; pprev = &mm->mmap; /* @@ -276,9 +275,6 @@ void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - extern struct mm_struct *swap_mm; - if (swap_mm == mm) - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); @@ -701,8 +697,6 @@ #endif p->lock_depth = -1; 
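
The pte_chain interface declared above is shaped so that the possibly sleeping allocation happens before page_table_lock is taken: page_add_rmap() consumes the preallocated chain only if it needs it and returns whatever is left over. A condensed version of the calling convention used by the fault paths later in this patch; this is a fragment for illustration (mm, ptep, page and entry are whatever the caller has at hand), not compilable on its own:

	struct pte_chain *pte_chain = pte_chain_alloc(GFP_KERNEL);

	if (!pte_chain)
		return -ENOMEM;			/* caller-specific handling */

	spin_lock(&mm->page_table_lock);
	set_pte(ptep, entry);
	pte_chain = page_add_rmap(page, ptep, pte_chain);
	spin_unlock(&mm->page_table_lock);

	pte_chain_free(pte_chain);		/* no-op if the chain was consumed */
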
/* -1 = no lock */ p->start_time = jiffies; - - INIT_LIST_HEAD(&p->local_pages); retval = -ENOMEM; /* copy all the process information */ diff -Nru a/kernel/sys.c b/kernel/sys.c --- a/kernel/sys.c Thu Apr 17 15:25:14 2003 +++ b/kernel/sys.c Thu Apr 17 15:25:14 2003 @@ -1147,6 +1147,12 @@ if (resource == RLIMIT_NOFILE) { if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) return -EPERM; + } else if (resource == RLIMIT_RSS && current->mm) { + /* rlimit is specified in bytes, convert to pages */ + unsigned long pages = RLIM_INFINITY; + if (new_rlim.rlim_cur != RLIM_INFINITY) + pages = new_rlim.rlim_cur >> PAGE_SHIFT; + current->mm->rlimit_rss = pages; } *old_rlim = new_rlim; return 0; diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c --- a/kernel/sysctl.c Thu Apr 17 15:25:14 2003 +++ b/kernel/sysctl.c Thu Apr 17 15:25:14 2003 @@ -268,6 +268,8 @@ &bdflush_min, &bdflush_max}, {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory, sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec}, + {VM_PAGECACHE, "pagecache", &cache_limits, + sizeof(struct cache_limits), 0644, NULL, &proc_dointvec}, {VM_PAGERDAEMON, "kswapd", &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PGT_CACHE, "pagetable_cache", diff -Nru a/mm/Makefile b/mm/Makefile --- a/mm/Makefile Thu Apr 17 15:25:14 2003 +++ b/mm/Makefile Thu Apr 17 15:25:14 2003 @@ -14,7 +14,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o rmap.o obj-$(CONFIG_HIGHMEM) += highmem.o diff -Nru a/mm/TODO b/mm/TODO --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/TODO Thu Apr 17 15:25:14 2003 @@ -0,0 +1,38 @@ + VM TODO list + +Forever valid TODO entries: + - keep up with the official kernel + - port over bugfixes + - minimise the diff by keeping code in sync where possible + +Easy short-term features: + - reclaim swap space from refill_inactive() + - simplify SMP locking + - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with + one single function using a for_each_pte() macro + for_each_pte(ptep, mm, start_address, end_address) + - fix page_launder() to not eat horrible amounts of CPU or flush + all pages to disk at once + - better VM balancing, clean vs. dirty ratio + - fix loopback device deadlock + riel: nr_fract=70%, nr_fract_sync=80% + riel: setup a loopback fs ext2-on-ext2 + riel: boot with mem=64m + riel: then write a 500 meg file. + riel: current kernel livelocks. + - stabilise pte_highmem and integrate it with rmap + - page_cache_size per zone + - pte_chain list per zone + - get rid of other global structures/stats, make them per zone + +Long-term features: + - extensive VM statistics + - IO clustering for page_launder() and sync_old_buffers() + - readahead on per-VMA level (+ drop behind?) 
+ - more graceful degradation when the load gets high + - reducing readahead + - unfair pageout so not all apps fall over + - memory objects, using pagecache and tmpfs for storage so + the memory object itself doesn't introduce any new overhead + - using the memory objects, removing page table copying from fork() + - load control able to deal with really extreme loads, swapping diff -Nru a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c Thu Apr 17 15:25:14 2003 +++ b/mm/filemap.c Thu Apr 17 15:25:14 2003 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -54,15 +55,14 @@ spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED}; /* - * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock + * NOTE: to avoid deadlocking you must never acquire the lru lock * with the pagecache_lock held. * * Ordering: * swap_lock -> - * pagemap_lru_lock -> + * lru lock -> * pagecache_lock */ -spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) @@ -100,6 +100,8 @@ mapping->nrpages--; list_del(&page->list); + if (!mapping->nrpages) + refile_inode(mapping->host); page->mapping = NULL; } @@ -185,7 +187,7 @@ head = &inode->i_mapping->clean_pages; - spin_lock(&pagemap_lru_lock); + lru_lock(ALL_ZONES); spin_lock(&pagecache_lock); curr = head->next; @@ -207,6 +209,7 @@ if (page_count(page) != 1) goto unlock; + /* Manual lru del to avoid lock ordering problems */ __lru_cache_del(page); __remove_inode_page(page); UnlockPage(page); @@ -218,7 +221,7 @@ } spin_unlock(&pagecache_lock); - spin_unlock(&pagemap_lru_lock); + lru_unlock(ALL_ZONES); } static int do_flushpage(struct page *page, unsigned long offset) @@ -239,8 +242,11 @@ static void truncate_complete_page(struct page *page) { - /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) + /* + * Leave it on the LRU if it gets converted into anonymous buffers + * or anonymous process memory. + */ + if ((!page->buffers || do_flushpage(page, 0)) && !page->pte.direct) lru_cache_del(page); /* @@ -877,6 +883,32 @@ wake_up_all(waitqueue); } + +/* like wait_on_page but with a timeout (in jiffies). + * returns 1 on timeout + */ +int wait_on_page_timeout(struct page *page, int timeout) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + if (!PageLocked(page)) + return 0; + + add_wait_queue(waitqueue, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + timeout = schedule_timeout(timeout); + } while (PageLocked(page) && timeout); + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(waitqueue, &wait); + return PageLocked(page); +} + /* * Get a lock on the page, assuming we need to sleep * to get it.. @@ -1032,6 +1064,54 @@ } /* + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. 
+ * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + lru_lock(ALL_ZONES); + while (--index >= start) { + struct page **hash = page_hash(mapping, index); + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + spin_unlock(&pagecache_lock); + if (!page || !PageActiveCache(page)) + break; + drop_page(page); + } + lru_unlock(ALL_ZONES); +} + +/* * Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should @@ -1302,6 +1382,12 @@ if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif @@ -1313,16 +1399,23 @@ /* * Mark a page as having seen activity. * - * If it was already so marked, move it to the active queue and drop - * the referenced bit. Otherwise, just mark it for future action.. + * We immediately reclaim the inactive clean pages because those are + * counted as freeable. We don't modify the inactive dirty ones because + * we're never sure if those are freeable anyway. */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page)) { + /* Mark the page referenced, AFTER checking for previous usage.. */ + SetPageReferenced(page); + + if (unlikely(PageInactiveClean(page) || PageInactiveLaundry(page))) { + struct zone_struct *zone = page_zone(page); + int free = zone->free_pages + zone->inactive_clean_pages; + activate_page(page); - ClearPageReferenced(page); - } else - SetPageReferenced(page); + if (free <= zone->pages_min) + wakeup_kswapd(GFP_NOIO); + } } /* @@ -1860,7 +1953,7 @@ nr = max; /* And limit it to a sane percentage of the inactive list.. 
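
A stripped-down model of the drop-behind walk added above: starting just behind the current read position, walk backwards through the previous readahead window and deactivate pages that are still on the active cache list, stopping at the first page that is not there. An array stands in for the page cache and "window" stands in for file->f_rawin (illustrative sketch only):

enum pstate { ABSENT, INACTIVE, ACTIVE_CACHE };

static void drop_behind_model(enum pstate *cache, unsigned long index,
                              unsigned long window)
{
        unsigned long start;

        if (!index)                     /* nothing behind the first page */
                return;

        start = (index > window) ? index - window : 0;

        while (--index >= start) {
                if (cache[index] != ACTIVE_CACHE)
                        break;          /* window shrank or page is gone */
                cache[index] = INACTIVE;        /* drop_page() stand-in */
                if (index == 0)
                        break;          /* avoid unsigned wrap-around */
        }
}
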
*/ - max = nr_inactive_pages / 2; + max = (nr_inactive_clean_pages() + nr_inactive_laundry_pages()) / 2; if (nr > max) nr = max; @@ -2108,7 +2201,8 @@ struct page *page = pte_page(pte); if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { flush_tlb_page(vma, address); - set_page_dirty(page); + set_page_dirty(page); /* This actually does not sleep */ + return 0; } } return 0; @@ -2118,7 +2212,7 @@ unsigned long address, unsigned long size, struct vm_area_struct *vma, unsigned long offset, unsigned int flags) { - pte_t * pte; + pte_t *pte, *mapping; unsigned long end; int error; @@ -2129,7 +2223,7 @@ pmd_clear(pmd); return 0; } - pte = pte_offset(pmd, address); + mapping = pte = pte_offset_map(pmd, address); offset += address & PMD_MASK; address &= ~PMD_MASK; end = address + size; @@ -2141,6 +2235,7 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_unmap(mapping); return error; } @@ -3081,6 +3176,7 @@ unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; /* * Try to find the page in the cache. If it isn't there, @@ -3089,8 +3185,10 @@ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. @@ -3134,8 +3232,11 @@ unlock: kunmap(page); /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + mark_page_accessed(page); page_cache_release(page); if (status < 0) diff -Nru a/mm/memory.c b/mm/memory.c --- a/mm/memory.c Thu Apr 17 15:25:14 2003 +++ b/mm/memory.c Thu Apr 17 15:25:14 2003 @@ -45,8 +45,10 @@ #include #include #include +#include #include +#include #include #include @@ -92,7 +94,7 @@ */ static inline void free_one_pmd(pmd_t * dir) { - pte_t * pte; + struct page *pte; if (pmd_none(*dir)) return; @@ -101,8 +103,9 @@ pmd_clear(dir); return; } - pte = pte_offset(dir, 0); + pte = pmd_page(*dir); pmd_clear(dir); + pgtable_remove_rmap(pte); pte_free(pte); } @@ -138,6 +141,62 @@ return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); } +pte_t *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + struct page *new; + + new = pte_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + } + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pmd_present(*pmd)) { + pte_free(new); + goto out; + } + pgtable_add_rmap(new, mm, address); + pmd_populate(mm, pmd, new); + } +out: + if (pmd_present(*pmd)) + return pte_offset_map(pmd, address); + return NULL; +} + +pte_t *pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + pte_t *new; + + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. 
+ */ + if (pmd_present(*pmd)) { + pte_free_kernel(new); + goto out; + } + pmd_populate_kernel(mm, pmd, new); + } +out: + return pte_offset_kernel(pmd, address); +} + /* * This function clears all user-level page tables of a process - this @@ -171,7 +230,7 @@ * variable count and make things faster. -jj * * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc(). + * but may be dropped within pmd_alloc() and pte_alloc_map(). */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -180,6 +239,16 @@ unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + struct pte_chain * pte_chain = NULL; + + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + spin_unlock(&dst->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + spin_lock(&dst->page_table_lock); + if (!pte_chain) + goto nomem; + } src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -223,12 +292,12 @@ goto cont_copy_pmd_range; } - src_pte = pte_offset(src_pmd, address); - dst_pte = pte_alloc(dst, dst_pmd, address); + dst_pte = pte_alloc_map(dst, dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; struct page *ptepage; @@ -237,9 +306,11 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || @@ -260,23 +331,53 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + pte_chain = page_add_rmap(ptepage, dst_pte, + pte_chain); + if (pte_chain) + goto cont_copy_pte_range_noset; + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (pte_chain) + goto cont_copy_pte_range_noset; + + /* + * pte_chain allocation failed, and we need to + * run page reclaim. 
+ */ + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + spin_lock(&dst->page_table_lock); + if (!pte_chain) + goto nomem; + spin_lock(&src->page_table_lock); + dst_pte = pte_offset_map(dst_pmd, address); + src_pte = pte_offset_map_nested(src_pmd, + address); cont_copy_pte_range_noset: address += PAGE_SIZE; - if (address >= end) - goto out_unlock; + if (address >= end) { + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src->page_table_lock); + goto out; + } src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); + pte_unmap_nested(src_pte-1); + pte_unmap(dst_pte-1); spin_unlock(&src->page_table_lock); cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); } -out_unlock: - spin_unlock(&src->page_table_lock); out: + pte_chain_free(pte_chain); return 0; nomem: + pte_chain_free(pte_chain); return -ENOMEM; } @@ -294,7 +395,7 @@ static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) { unsigned long offset; - pte_t * ptep; + pte_t * ptep, *mapping; int freed = 0; if (pmd_none(*pmd)) @@ -304,7 +405,7 @@ pmd_clear(pmd); return 0; } - ptep = pte_offset(pmd, address); + mapping = ptep = pte_offset_map(pmd, address); offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; @@ -315,8 +416,10 @@ continue; if (pte_present(pte)) { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) + if (VALID_PAGE(page) && !PageReserved(page)) { freed ++; + page_remove_rmap(page, ptep); + } /* This will eventually call __free_pte on the pte. */ tlb_remove_page(tlb, ptep, address + offset); } else { @@ -324,6 +427,7 @@ pte_clear(ptep); } } + pte_unmap(mapping); return freed; } @@ -354,49 +458,65 @@ return freed; } -/* - * remove user pages in a given range. +#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) + +/** + * zap_page_range - remove user pages in a given range + * @mm: mm_struct containing the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap */ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; - unsigned long start = address, end = address + size; - int freed = 0; - - dir = pgd_offset(mm, address); - + unsigned long start, end, addr, block; + int freed; + /* - * This is a long-lived spinlock. That's fine. - * There's no contention, because the page table - * lock only protects against kswapd anyway, and - * even if kswapd happened to be looking at this - * process we _want_ it to get stuck. + * Break the work up into blocks of ZAP_BLOCK_SIZE pages: + * this decreases lock-hold time for the page_table_lock + * dramatically, which could otherwise be held for a very + * long time. This decreases lock contention and increases + * periods of preemptibility. 
*/ - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - flush_cache_range(mm, address, end); - tlb = tlb_gather_mmu(mm); + while (size) { + if (size > ZAP_BLOCK_SIZE) + block = ZAP_BLOCK_SIZE; + else + block = size; + + freed = 0; + start = addr = address; + end = address + block; + dir = pgd_offset(mm, address); - do { - freed += zap_pmd_range(tlb, dir, address, end - address); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); + BUG_ON(address >= end); - /* this will flush any remaining tlb entries */ - tlb_finish_mmu(tlb, start, end); + spin_lock(&mm->page_table_lock); + flush_cache_range(mm, start, end); + tlb = tlb_gather_mmu(mm); - /* - * Update rss for the mm_struct (not necessarily current->mm) - * Notice that rss is an unsigned long. - */ - if (mm->rss > freed) - mm->rss -= freed; - else - mm->rss = 0; - spin_unlock(&mm->page_table_lock); + do { + freed += zap_pmd_range(tlb, dir, addr, end - addr); + addr = (addr + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (addr && (addr < end)); + + /* this will flush any remaining tlb entries */ + tlb_finish_mmu(tlb, start, end); + + /* Update rss for the mm_struct (need not be current->mm) */ + if (mm->rss > freed) + mm->rss -= freed; + else + mm->rss = 0; + + spin_unlock(&mm->page_table_lock); + + address += block; + size -= block; + } } /* @@ -407,6 +527,7 @@ pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; + struct page *page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) @@ -416,19 +537,19 @@ if (pmd_none(*pmd) || pmd_bad(*pmd)) goto out; - ptep = pte_offset(pmd, address); + ptep = pte_offset_map(pmd, address); if (!ptep) goto out; pte = *ptep; + pte_unmap(ptep); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) - return pte_page(pte); + page = pte_page(pte); } - out: - return 0; + return page; } /* @@ -777,10 +898,11 @@ if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc(mm, pmd, address); + pte_t * pte = pte_alloc_map(mm, pmd, address); if (!pte) return -ENOMEM; zeromap_pte_range(pte, address, end - address, prot); + pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -849,18 +971,20 @@ static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, pgprot_t prot) { - unsigned long end; + unsigned long base, end; + base = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc(mm, pmd, address); + pte_t * pte = pte_alloc_map(mm, pmd, address + base); if (!pte) return -ENOMEM; - remap_pte_range(pte, address, end - address, address + phys_addr, prot); + remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -946,9 +1070,10 @@ * with the page_table_lock released. 
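
The reworked zap_page_range() above bounds how long page_table_lock is held by handling the range in ZAP_BLOCK_SIZE chunks and releasing the lock between chunks. The shape of that loop, modelled in user space (the mutex and the per-byte "teardown" are stand-ins for the real lock and pte teardown):

#include <pthread.h>
#include <stddef.h>

#define BLOCK_BYTES (256 * 4096)        /* 256 pages of 4K, as above */

static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;

static void zap_range_model(unsigned char *base, size_t address, size_t size)
{
        while (size) {
                size_t block = size > BLOCK_BYTES ? BLOCK_BYTES : size;

                pthread_mutex_lock(&range_lock);
                for (size_t i = 0; i < block; i++)
                        base[address + i] = 0;  /* pte teardown stand-in */
                pthread_mutex_unlock(&range_lock);

                address += block;
                size -= block;
        }
}

Dropping the lock between blocks is what gives other lockers, and preemption, a chance to run during a large unmap.
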
*/ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, pte_t *page_table, pte_t pte) + unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) { struct page *old_page, *new_page; + struct pte_chain * pte_chain = NULL; old_page = pte_page(pte); if (!VALID_PAGE(old_page)) @@ -960,10 +1085,12 @@ if (reuse) { flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return 1; /* Minor fault */ } } + pte_unmap(page_table); /* * Ok, we need to copy. Oh, well.. @@ -971,6 +1098,9 @@ page_cache_get(old_page); spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_mem; @@ -980,26 +1110,33 @@ * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add(new_page); /* Free the old page.. */ new_page = old_page; } + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); + pte_chain_free(pte_chain); return 1; /* Minor fault */ bad_wp_page: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); return -1; no_mem: page_cache_release(old_page); + pte_chain_free(pte_chain); return -1; } @@ -1096,6 +1233,10 @@ struct page *new_page; unsigned long offset; + /* Low on free memory ? Don't make things worse. */ + if (free_low(ALL_ZONES) < 0) + return; + /* * Get the number of handles we should do readahead io to. */ @@ -1116,13 +1257,15 @@ */ static int do_swap_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, - pte_t * page_table, pte_t orig_pte, int write_access) + pte_t * page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); + struct pte_chain * pte_chain = NULL; pte_t pte; int ret = 1; + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1135,7 +1278,9 @@ */ int retval; spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); retval = pte_same(*page_table, orig_pte) ? -1 : 1; + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return retval; } @@ -1145,7 +1290,11 @@ } mark_page_accessed(page); - + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + page_cache_release(page); + return -1; + } lock_page(page); /* @@ -1153,10 +1302,13 @@ * released the page table lock. 
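
With pte-highmem, page tables may live in high memory, so a pte pointer is only valid between pte_offset_map() (an atomic kmap) and pte_unmap(); that is why the fault paths here re-look the pte up after any point where they may have slept. A condensed fragment of the bracketing discipline, mirroring follow_page() earlier in this file (illustration only, not standalone code):

	pte_t *ptep, pte;

	ptep = pte_offset_map(pmd, address);
	pte = *ptep;		/* take a copy while the table is mapped */
	pte_unmap(ptep);	/* drop the temporary kernel mapping */

	/* work on the copied value; remap and re-check before modifying */
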
*/ spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); unlock_page(page); page_cache_release(page); + pte_chain_free(pte_chain); return 1; } @@ -1175,10 +1327,13 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return ret; } @@ -1187,18 +1342,31 @@ * spinlock held to protect against concurrent faults in * multithreaded programs. */ -static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr) { pte_t entry; + struct page * page = ZERO_PAGE(addr); + struct pte_chain * pte_chain; + int ret; + + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); + } /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; - /* Allocate our own private page. */ + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1207,27 +1375,36 @@ clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { page_cache_release(page); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - return 1; + ret = 1; + goto out; } mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); - mark_page_accessed(page); } set_pte(page_table, entry); + /* ignores ZERO PAGE */ + pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - return 1; /* Minor fault */ + ret = 1; /* Minor fault */ + goto out; no_mem: - return -1; + ret = -1; +out: + pte_chain_free(pte_chain); + return ret; } /* @@ -1243,13 +1420,15 @@ * spinlock held. Exit with the spinlock released. 
*/ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *page_table) + unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { + struct pte_chain * pte_chain; struct page * new_page; pte_t entry; if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, write_access, address); + return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1274,7 +1453,15 @@ new_page = page; } + mark_page_accessed(new_page); + + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + page_cache_release(new_page); + return -1; + } spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); /* * This silly early PAGE_DIRTY setting removes a race * due to the bad i386 page protection. But it's valid @@ -1294,16 +1481,21 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + pte_chain = page_add_rmap(new_page, page_table, pte_chain); + pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. */ + pte_unmap(page_table); page_cache_release(new_page); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return 1; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return 2; /* Major fault */ } @@ -1330,7 +1522,7 @@ */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, - int write_access, pte_t * pte) + int write_access, pte_t *pte, pmd_t *pmd) { pte_t entry; @@ -1342,18 +1534,19 @@ * drop the lock. */ if (pte_none(entry)) - return do_no_page(mm, vma, address, write_access, pte); - return do_swap_page(mm, vma, address, pte, entry, write_access); + return do_no_page(mm, vma, address, write_access, pte, pmd); + return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); } if (write_access) { if (!pte_write(entry)) - return do_wp_page(mm, vma, address, pte, entry); + return do_wp_page(mm, vma, address, pte, pmd, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); + pte_unmap(pte); spin_unlock(&mm->page_table_lock); return 1; } @@ -1370,6 +1563,14 @@ current->state = TASK_RUNNING; pgd = pgd_offset(mm, address); + /* + * If we are over our RSS limit and the system needs memory, + * we will free memory for the non-hogs and slow down a bit. + */ + if (mm->rlimit_rss && mm->rss > mm->rlimit_rss && + free_high(ALL_ZONES) > 0) + rss_free_pages(GFP_HIGHUSER); + /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. @@ -1378,9 +1579,9 @@ pmd = pmd_alloc(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc(mm, pmd, address); + pte_t * pte = pte_alloc_map(mm, pmd, address); if (pte) - return handle_pte_fault(mm, vma, address, write_access, pte); + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } spin_unlock(&mm->page_table_lock); return -1; @@ -1422,41 +1623,6 @@ return pmd_offset(pgd, address); } -/* - * Allocate the page table directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. 
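
For reference, mm->rlimit_rss is kept in pages (sys_setrlimit converts the byte value, earlier in this patch), and handle_mm_fault() only throttles a process that is both over its own limit and running on a system that is short of memory. The arithmetic as a stand-alone sketch, assuming 4K pages (PAGE_SHIFT 12):

#define SKETCH_PAGE_SHIFT       12
#define SKETCH_RLIM_INFINITY    (~0UL)

static unsigned long rss_limit_pages(unsigned long rlim_cur_bytes)
{
        if (rlim_cur_bytes == SKETCH_RLIM_INFINITY)
                return SKETCH_RLIM_INFINITY;
        return rlim_cur_bytes >> SKETCH_PAGE_SHIFT;
}

/* e.g. a 64 MB limit: rss_limit_pages(64UL << 20) == 16384 pages */

static int over_rss_limit(unsigned long rss, unsigned long rlimit_rss,
                          int global_free_shortage)
{
        /* only throttle hogs while the system as a whole wants memory */
        return rlimit_rss && rss > rlimit_rss && global_free_shortage > 0;
}
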
- */ -pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -{ - if (pmd_none(*pmd)) { - pte_t *new; - - /* "fast" allocation can happen without dropping the lock.. */ - new = pte_alloc_one_fast(mm, address); - if (!new) { - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (!pmd_none(*pmd)) { - pte_free(new); - goto out; - } - } - pmd_populate(mm, pmd, new); - } -out: - return pte_offset(pmd, address); -} - int make_pages_present(unsigned long addr, unsigned long end) { int ret, len, write; @@ -1486,10 +1652,12 @@ if (!pgd_none(*pgd)) { pmd = pmd_offset(pgd, addr); if (!pmd_none(*pmd)) { - pte = pte_offset(pmd, addr); + /* FIXME: shouldn't this be pte_offset_kernel ??? */ + pte = pte_offset_map(pmd, addr); if (pte_present(*pte)) { page = pte_page(*pte); } + pte_unmap(pte); } } return page; diff -Nru a/mm/mprotect.c b/mm/mprotect.c --- a/mm/mprotect.c Thu Apr 17 15:25:14 2003 +++ b/mm/mprotect.c Thu Apr 17 15:25:14 2003 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -15,7 +16,7 @@ static inline void change_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, pgprot_t newprot) { - pte_t * pte; + pte_t *pte, *mapping; unsigned long end; if (pmd_none(*pmd)) @@ -25,7 +26,7 @@ pmd_clear(pmd); return; } - pte = pte_offset(pmd, address); + mapping = pte = pte_offset_map(pmd, address); address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -44,6 +45,7 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_unmap(mapping); } static inline void change_pmd_range(pgd_t * pgd, unsigned long address, diff -Nru a/mm/mremap.c b/mm/mremap.c --- a/mm/mremap.c Thu Apr 17 15:25:14 2003 +++ b/mm/mremap.c Thu Apr 17 15:25:14 2003 @@ -9,13 +9,14 @@ #include #include #include +#include #include #include extern int vm_enough_memory(long pages); -static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +static inline pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) { pgd_t * pgd; pmd_t * pmd; @@ -39,30 +40,55 @@ goto end; } - pte = pte_offset(pmd, addr); - if (pte_none(*pte)) + pte = pte_offset_map_nested(pmd, addr); + if (pte_none(*pte)) { + pte_unmap_nested(pte); pte = NULL; + } end: return pte; } -static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +#ifdef CONFIG_HIGHPTE /* Save a few cycles on the sane machines */ +static inline int page_table_present(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return 0; + pmd = pmd_offset(pgd, addr); + return pmd_present(*pmd); +} +#else +#define page_table_present(mm, addr) (1) +#endif + +static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) { pmd_t * pmd; pte_t * pte = NULL; pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); if (pmd) - pte = pte_alloc(mm, pmd, addr); + pte = pte_alloc_map(mm, pmd, addr); return pte; } -static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) +static int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst, + struct pte_chain ** pte_chainp) { int error = 0; pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); pte = ptep_get_and_clear(src); if 
(!dst) { /* No dest? We must put it back. */ @@ -70,29 +96,53 @@ error++; } set_pte(dst, pte); + if (page) + *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } return error; } -static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +static int move_one_page(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr) { + struct mm_struct *mm = vma->vm_mm; + struct pte_chain * pte_chain; int error = 0; - pte_t * src; + pte_t *src, *dst; + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + return -1; spin_lock(&mm->page_table_lock); - src = get_one_pte(mm, old_addr); - if (src) - error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr)); + src = get_one_pte_map_nested(mm, old_addr); + if (src) { + /* + * Look to see whether alloc_one_pte_map needs to perform a + * memory allocation. If it does then we need to drop the + * atomic kmap + */ + if (!page_table_present(mm, new_addr)) { + pte_unmap_nested(src); + src = NULL; + } + dst = alloc_one_pte_map(mm, new_addr); + if (src == NULL) + src = get_one_pte_map_nested(mm, old_addr); + error = copy_one_pte(mm, src, dst, &pte_chain); + pte_unmap_nested(src); + pte_unmap(dst); + } + flush_tlb_page(vma, old_addr); spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return error; } -static int move_page_tables(struct mm_struct * mm, +static int move_page_tables(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_addr, unsigned long len) { unsigned long offset = len; - flush_cache_range(mm, old_addr, old_addr + len); + flush_cache_range(vma, old_addr, old_addr + len); /* * This is not the clever way to do this, but we're taking the @@ -101,10 +151,9 @@ */ while (offset) { offset -= PAGE_SIZE; - if (move_one_page(mm, old_addr + offset, new_addr + offset)) + if (move_one_page(vma, old_addr + offset, new_addr + offset)) goto oops_we_failed; } - flush_tlb_range(mm, old_addr, old_addr + len); return 0; /* @@ -115,14 +164,14 @@ * the old page tables) */ oops_we_failed: - flush_cache_range(mm, new_addr, new_addr + len); + flush_cache_range(vma, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) - move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + move_one_page(vma, new_addr + offset, old_addr + offset); + zap_page_range(vma->vm_mm, new_addr, len); return -1; } -static inline unsigned long move_vma(struct vm_area_struct * vma, +static unsigned long move_vma(struct vm_area_struct * vma, unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr) { @@ -146,7 +195,8 @@ prev->vm_end = next->vm_end; __vma_unlink(mm, next, prev); spin_unlock(&mm->page_table_lock); - + if (vma == next) + vma = prev; mm->map_count--; kmem_cache_free(vm_area_cachep, next); } @@ -176,7 +226,7 @@ allocated_vma = 1; } - if (!move_page_tables(current->mm, new_addr, addr, old_len)) { + if (!move_page_tables(vma, new_addr, addr, old_len)) { if (allocated_vma) { *new_vma = *vma; new_vma->vm_start = new_addr; @@ -252,12 +302,14 @@ /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. 
+ * do_munmap does all the needed commit accounting */ ret = addr; if (old_len >= new_len) { do_munmap(current->mm, addr+new_len, old_len - new_len); if (!(flags & MREMAP_FIXED) || (new_addr == addr)) goto out; + old_len = new_len; } /* diff -Nru a/mm/oom_kill.c b/mm/oom_kill.c --- a/mm/oom_kill.c Thu Apr 17 15:25:14 2003 +++ b/mm/oom_kill.c Thu Apr 17 15:25:14 2003 @@ -168,6 +168,7 @@ static void oom_kill(void) { struct task_struct *p, *q; + extern wait_queue_head_t kswapd_done; read_lock(&tasklist_lock); p = select_bad_process(); @@ -183,6 +184,9 @@ } read_unlock(&tasklist_lock); + /* Chances are by this time our victim is sleeping on kswapd. */ + wake_up(&kswapd_done); + /* * Make kswapd go out of the way, so "p" has a good chance of * killing itself before someone else gets the chance to ask @@ -199,12 +203,6 @@ { static unsigned long first, last, count, lastkill; unsigned long now, since; - - /* - * Enough swap space left? Not OOM. - */ - if (nr_swap_pages > 0) - return; now = jiffies; since = now - last; diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c Thu Apr 17 15:25:14 2003 +++ b/mm/page_alloc.c Thu Apr 17 15:25:14 2003 @@ -10,6 +10,7 @@ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per-CPU page pool, Ingo Molnar, Red Hat, 2001, 2002 */ #include @@ -21,12 +22,10 @@ #include #include #include +#include +#include int nr_swap_pages; -int nr_active_pages; -int nr_inactive_pages; -LIST_HEAD(inactive_list); -LIST_HEAD(active_list); pg_data_t *pgdat_list; /* @@ -42,6 +41,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, }; +static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, }; /* * Temporary debugging check. @@ -84,6 +85,7 @@ unsigned long index, page_idx, mask, flags; free_area_t *area; struct page *base; + per_cpu_t *per_cpu; zone_t *zone; /* @@ -98,20 +100,30 @@ if (page->buffers) BUG(); - if (page->mapping) + if (page->mapping) { + printk(KERN_CRIT "Page has mapping still set. This is a serious situation. 
However if you \n"); + printk(KERN_CRIT "are using the NVidia binary only module please report this bug to \n"); + printk(KERN_CRIT "NVidia and not to the linux kernel mailinglist.\n"); BUG(); + } if (!VALID_PAGE(page)) BUG(); if (PageLocked(page)) BUG(); - if (PageActive(page)) + if (PageActiveAnon(page)) + BUG(); + if (PageActiveCache(page)) + BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveLaundry(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + if (page->pte.direct) BUG(); page->flags &= ~((1<flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: - + zone = page_zone(page); mask = (~0UL) << order; @@ -123,7 +135,18 @@ area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); + per_cpu = zone->cpu_pages + smp_processor_id(); + + __save_flags(flags); + __cli(); + if (!order && (per_cpu->nr_pages < per_cpu->max_nr_pages) && (free_high(zone) <= 0)) { + list_add(&page->list, &per_cpu->head); + per_cpu->nr_pages++; + __restore_flags(flags); + return; + } + + spin_lock(&zone->lock); zone->free_pages -= mask; @@ -158,17 +181,6 @@ list_add(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -198,13 +210,32 @@ static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); static struct page * rmqueue(zone_t *zone, unsigned int order) { + per_cpu_t *per_cpu = zone->cpu_pages + smp_processor_id(); free_area_t * area = zone->free_area + order; unsigned int curr_order = order; struct list_head *head, *curr; unsigned long flags; struct page *page; + int threshold = 0; + + if (!(current->flags & PF_MEMALLOC)) + threshold = (per_cpu->max_nr_pages / 8); + __save_flags(flags); + __cli(); - spin_lock_irqsave(&zone->lock, flags); + if (!order && (per_cpu->nr_pages>threshold)) { + if (unlikely(list_empty(&per_cpu->head))) + BUG(); + page = list_entry(per_cpu->head.next, struct page, list); + list_del(&page->list); + per_cpu->nr_pages--; + __restore_flags(flags); + + set_page_count(page, 1); + return page; + } + + spin_lock(&zone->lock); do { head = &area->free_list; curr = head->next; @@ -227,10 +258,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -249,76 +277,83 @@ } #endif -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do this work ourselves, call kswapd. 
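The hunks above give __free_pages_ok() and rmqueue() a per-CPU front end: order-0 frees are parked on a small per-CPU list (bounded by max_nr_pages) and order-0 allocations are served from that list first, so the zone spinlock is only taken when the per-CPU pool over- or underflows. A minimal userspace sketch of that fast-path/slow-path split follows; NCPU, MAX_PER_CPU, the int "page" handles and the global counter are invented stand-ins for struct page and the buddy lists, and the small reserve threshold the real rmqueue() applies is omitted here.

#include <stdio.h>

#define NCPU         2
#define MAX_PER_CPU  4

static int pool[NCPU][MAX_PER_CPU];     /* per-CPU stacks of free "pages" */
static int pool_count[NCPU];
static int global_free = 100;           /* stands in for the buddy lists  */

static void free_page(int cpu, int page)
{
        if (pool_count[cpu] < MAX_PER_CPU) {
                pool[cpu][pool_count[cpu]++] = page;    /* lock-free fast path */
                return;
        }
        global_free++;                   /* slow path: would take zone->lock */
}

static int alloc_page(int cpu)
{
        if (pool_count[cpu] > 0)
                return pool[cpu][--pool_count[cpu]];    /* lock-free fast path */
        /* slow path: would take zone->lock and go to the buddy lists */
        return --global_free;
}

int main(void)
{
        free_page(0, 42);
        printf("got page %d, global_free=%d\n", alloc_page(0), global_free);
        return 0;
}

The point of the design is that a page freed and reallocated on the same CPU never touches the shared zone lock or its cache lines.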
+ */ +void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +void fixup_freespace(zone_t * zone, int direct_reclaim) +{ + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages_ok(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(GFP_ATOMIC); +} + +#define PAGES_KERNEL 0 +#define PAGES_MIN 1 +#define PAGES_LOW 2 +#define PAGES_HIGH 3 + +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) { - struct page * page = NULL; - int __freed = 0; + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; - if (!(gfp_mask & __GFP_WAIT)) - goto out; - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages_zone(classzone, gfp_mask); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); + for (;;) { + zone_t *z = *(zone++); - break; - } - } while ((entry = entry->next) != local_pages); + if (!z) + break; + if (!z->size) + BUG(); + + /* + * We allocate if the number of (free + inactive_clean) + * pages is above the watermark. + */ + switch (limit) { + case PAGES_KERNEL: + water_mark = z->pages_min / 2; + break; + case PAGES_MIN: + water_mark = z->pages_min; + break; + case PAGES_LOW: + water_mark = z->pages_low; + break; + default: + case PAGES_HIGH: + water_mark = z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; } - current->nr_local_pages = 0; } - out: - *freed = __freed; - return page; + + /* Found nothing. */ + return NULL; } /* @@ -326,100 +361,262 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zone, * classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later tripped up by a schedule().) + * + * We fall back to lower-level zones if allocation + * in a higher zone fails. 
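__alloc_pages_limit() above is called repeatedly with a progressively lower limit (PAGES_HIGH, then PAGES_LOW, PAGES_MIN and finally PAGES_KERNEL), and on each pass a zone is eligible when its free plus inactive_clean pages are above the chosen watermark. The sketch below just reproduces that watermark test; struct toy_zone and the numbers are made up for the example, while the PAGES_* names and the pages_min/2 emergency mark come from the patch.

#include <stdio.h>

enum limit { PAGES_KERNEL, PAGES_MIN, PAGES_LOW, PAGES_HIGH };

struct toy_zone {
        long pages_min, pages_low, pages_high;
        long free_pages, inactive_clean_pages;
};

static int zone_has_room(const struct toy_zone *z, enum limit limit)
{
        long mark;

        switch (limit) {
        case PAGES_KERNEL: mark = z->pages_min / 2; break;  /* emergency pool */
        case PAGES_MIN:    mark = z->pages_min;     break;
        case PAGES_LOW:    mark = z->pages_low;     break;
        default:           mark = z->pages_high;    break;
        }
        return z->free_pages + z->inactive_clean_pages >= mark;
}

int main(void)
{
        struct toy_zone z = { .pages_min = 20, .pages_low = 40, .pages_high = 60,
                              .free_pages = 10, .inactive_clean_pages = 25 };

        printf("HIGH: %d  LOW: %d  MIN: %d  KERNEL: %d\n",
               zone_has_room(&z, PAGES_HIGH), zone_has_room(&z, PAGES_LOW),
               zone_has_room(&z, PAGES_MIN),  zone_has_room(&z, PAGES_KERNEL));
        return 0;
}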
+ */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data we would want to cache. + */ zone = zonelist->zones; - classzone = *zone; - if (classzone == NULL) + if (!*zone) return NULL; min = 1UL << order; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + BUG(); - min += z->pages_low; + min += z->pages_min; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); } - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Next, try to allocate a page from a zone with a HIGH + * amount of (free + inactive_clean) pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; + + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low of (free + inactive_clean) pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + wakeup_kswapd(gfp_mask); + + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Kernel allocations can eat a few emergency pages. + * We should be able to run without this, find out why + * the SCSI layer isn't happy ... + */ + if (gfp_mask & __GFP_HIGH) { + page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim); + if (page) + return page; + } + + /* + * Oh well, we didn't succeed. + */ + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * If so, try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * If we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we need to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... + * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. 
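The comments above describe the order in which the rewritten __alloc_pages() gets more aggressive: plain free pages first, then the HIGH and LOW watermark passes, a kswapd wakeup, the MIN pass, and a PAGES_KERNEL emergency pass that only __GFP_HIGH callers may use before the reclaim/defragment code runs. The following sketch only models that ordering; try_pass(), toy_alloc() and the "fullness" number are invented for the illustration and do not correspond to real helpers.

#include <stdio.h>

enum pass { FREE_ONLY, HIGH, LOW, MIN, KERNEL_RESERVE, NR_PASSES };

/* Pretend a pass succeeds once the allocator is desperate enough. */
static int try_pass(enum pass p, int fullness)
{
        return p >= fullness;           /* toy success criterion */
}

static int toy_alloc(int gfp_high, int fullness)
{
        enum pass p;

        for (p = FREE_ONLY; p < NR_PASSES; p++) {
                if (p == MIN)
                        printf("  (wakeup_kswapd would run here)\n");
                if (p == KERNEL_RESERVE && !gfp_high)
                        break;          /* emergency pages only for __GFP_HIGH */
                if (try_pass(p, fullness)) {
                        printf("  satisfied at pass %d\n", p);
                        return 1;
                }
        }
        printf("  would fall through to try_to_free_pages()/defragment\n");
        return 0;
}

int main(void)
{
        printf("light memory pressure:\n");
        toy_alloc(0, 1);
        printf("heavy pressure, __GFP_HIGH-style caller:\n");
        toy_alloc(1, 4);
        return 0;
}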
+ */ + if (gfp_mask & __GFP_WAIT) { + yield(); + if (!order || free_high(ALL_ZONES) >= 0) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail if no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } + } + } + + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... + */ zone = zonelist->zones; min = 1UL << order; for (;;) { - unsigned long local_min; zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - local_min = z->pages_min; - if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * death. + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } } + goto out_failed; - /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages, and last we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... + */ +defragment: + { + int try_harder = 0; + unsigned int mask = 0; + int numpages; +defragment_again: zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + continue; - page = rmqueue(z, order); - if (page) - return page; - } - return NULL; - } - - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - return NULL; - - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; + /* + * Try to free the zone's inactive laundry pages. + * Nonblocking in the first pass; blocking in the + * second pass, but never on very new IO. + */ + numpages = z->inactive_laundry_pages; + if (try_harder) { + numpages /= 2; + mask = gfp_mask; + } - zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; + current->flags |= PF_MEMALLOC; + rebalance_laundry_zone(z, numpages, mask); + current->flags &= ~PF_MEMALLOC; + + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } + } - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* If we can wait for IO to complete, we wait... 
*/ + if (!try_harder && (gfp_mask & __GFP_FS)) { + try_harder = 1; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - return NULL; - - /* Yield for kswapd, and try again */ - yield(); - goto rebalance; +out_failed: + /* No luck.. */ +// printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order); + return NULL; } /* @@ -461,18 +658,29 @@ } /* - * Total amount of free (allocatable) RAM: + * These statistics are held in per-zone counters, so we need to loop + * over each zone and read the statistics. We use this silly macro + * so we don't need to duplicate the code for every statistic. + * If you have a better idea on how to implement this (cut'n'paste + * isn't considered better), please let me know - Rik */ -unsigned int nr_free_pages (void) -{ - unsigned int sum = 0; - zone_t *zone; - - for_each_zone(zone) - sum += zone->free_pages; +#define NR_FOO_PAGES(__function_name, __stat) \ + unsigned int __function_name (void) \ + { \ + unsigned int sum = 0; \ + zone_t *zone; \ + \ + for_each_zone(zone) \ + sum += zone->__stat; \ + return sum; \ + } - return sum; -} +NR_FOO_PAGES(nr_free_pages, free_pages) +NR_FOO_PAGES(nr_active_anon_pages, active_anon_pages) +NR_FOO_PAGES(nr_active_cache_pages, active_cache_pages) +NR_FOO_PAGES(nr_inactive_dirty_pages, inactive_dirty_pages) +NR_FOO_PAGES(nr_inactive_laundry_pages, inactive_laundry_pages) +NR_FOO_PAGES(nr_inactive_clean_pages, inactive_clean_pages) /* * Amount of free RAM allocatable as buffer memory: @@ -488,10 +696,10 @@ zone_t *zone; for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + sum += zone->free_pages; + sum += zone->inactive_clean_pages; + sum += zone->inactive_laundry_pages; + sum += zone->inactive_dirty_pages; } } @@ -543,10 +751,16 @@ tmpdat = tmpdat->node_next; } - printk("( Active: %d, inactive: %d, free: %d )\n", - nr_active_pages, - nr_inactive_pages, - nr_free_pages()); + printk("Free pages: %6dkB (%6dkB HighMem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + nr_free_highpages() << (PAGE_SHIFT-10)); + + printk("( Active: %d/%d, inactive_laundry: %d, inactive_clean: %d, free: %d )\n", + nr_active_anon_pages() + nr_active_cache_pages(), + nr_inactive_dirty_pages(), + nr_inactive_laundry_pages(), + nr_inactive_clean_pages(), + nr_free_pages()); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -681,6 +895,7 @@ * - mark all memory queues empty * - clear the memory bitmaps */ +extern unsigned int kswapd_minfree; void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, unsigned long *zones_size, unsigned long zone_start_paddr, unsigned long *zholes_size, struct page *lmem_map) @@ -726,8 +941,9 @@ offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { + int k; zone_t *zone = pgdat->node_zones + j; - unsigned long mask; + unsigned long mask, extrafree = 0; unsigned long size, realsize; zone_table[nid * MAX_NR_ZONES + j] = zone; @@ -738,10 +954,45 @@ printk("zone(%lu): %lu pages.\n", j, size); zone->size = size; zone->name = zone_names[j]; + + for (k = 0; k < NR_CPUS; k++) { + per_cpu_t *per_cpu = zone->cpu_pages + k; + + INIT_LIST_HEAD(&per_cpu->head); + per_cpu->nr_pages = 0; + per_cpu->max_nr_pages = realsize / smp_num_cpus / 128; + if (per_cpu->max_nr_pages > MAX_PER_CPU_PAGES) + per_cpu->max_nr_pages = MAX_PER_CPU_PAGES; + else if (!per_cpu->max_nr_pages) + per_cpu->max_nr_pages = 1; + } zone->lock = 
SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->active_anon_pages = 0; + zone->active_cache_pages = 0; + zone->inactive_clean_pages = 0; + zone->inactive_laundry_pages = 0; + zone->inactive_dirty_pages = 0; zone->need_balance = 0; + zone->need_scan = 0; + zone->age_interval = HZ; + zone->age_next = jiffies; + for (k = 0; k <= MAX_AGE ; k++) { + INIT_LIST_HEAD(&zone->active_anon_list[k]); + zone->active_anon_count[k] = 0; + } + for (k = 0; k <= MAX_AGE ; k++) { + INIT_LIST_HEAD(&zone->active_cache_list[k]); + zone->active_cache_count[k] = 0; + } + zone->cache_age_bias = 0; + zone->anon_age_bias = 0; + INIT_LIST_HEAD(&zone->inactive_dirty_list); + INIT_LIST_HEAD(&zone->inactive_laundry_list); + INIT_LIST_HEAD(&zone->inactive_clean_list); + spin_lock_init(&zone->lru_lock); + if (!size) continue; @@ -761,21 +1012,30 @@ pgdat->nr_zones = j+1; + /* + * On large memory machines we keep extra memory + * free for kernel allocations. + */ + if (zone_extrafree_ratio[j]) + extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]); + if (extrafree < zone_balance_max[j]) + extrafree = 0; + mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; - else if (mask > zone_balance_max[j]) - mask = zone_balance_max[j]; - zone->pages_min = mask; - zone->pages_low = mask*2; - zone->pages_high = mask*3; - + zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]); + zone->pages_low = extrafree + mask*2; + zone->pages_high = extrafree + mask*3; + zone->pages_plenty = extrafree + mask*6; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) printk("BUG: wrong zone alignment, it will crash\n"); + + kswapd_minfree += zone->pages_min; /* * Initially all pages are reserved - free ones are freed diff -Nru a/mm/rmap.c b/mm/rmap.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/rmap.c Thu Apr 17 15:25:14 2003 @@ -0,0 +1,545 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. + */ + +/* + * Locking: + * - the page->pte_chain is protected by the PG_chainlock bit, + * which nests within the zone lru_lock, then the + * - the page->pte.chain is protected by the PG_chainlock bit, + * which nests within the lru lock, then the + * mm->page_table_lock, and then the page lock. + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* #define DEBUG_RMAP */ + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * We use an array of pte pointers in this structure to minimise cache + * misses while traversing reverse maps. 
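The watermark setup in free_area_init_core() above boils down to a little arithmetic: a per-zone "mask" derived from the zone size sets pages_min/low/high/plenty, and on large-memory machines an extra reserve ("extrafree") is stacked on top of every watermark. The snippet below redoes that arithmetic with the ratios the patch uses for the first zone; the zone size is an arbitrary example value, not a claim about any real configuration.

#include <stdio.h>

int main(void)
{
        unsigned long realsize = 262144;        /* example: 1 GB worth of 4 kB pages */
        unsigned long realtotal = realsize;     /* single-zone example */
        unsigned long balance_ratio = 128, balance_min = 20, balance_max = 255;
        unsigned long extrafree_ratio = 128, extrafree_max = 1024;

        unsigned long extrafree = realtotal / extrafree_ratio;
        if (extrafree > extrafree_max)
                extrafree = extrafree_max;
        if (extrafree < balance_max)            /* too small to bother with */
                extrafree = 0;

        unsigned long mask = realsize / balance_ratio;
        if (mask < balance_min)
                mask = balance_min;

        unsigned long pages_min = extrafree + (mask < balance_max ? mask : balance_max);
        printf("min=%lu low=%lu high=%lu plenty=%lu\n",
               pages_min, extrafree + mask * 2,
               extrafree + mask * 3, extrafree + mask * 6);
        return 0;
}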
+ */ +#define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t)) + +struct pte_chain { + struct pte_chain *next; + pte_addr_t ptes[NRPTE]; +} ____cacheline_aligned; + +static kmem_cache_t *pte_chain_cache; + +/* + * pte_chain list management policy: + * + * - If a page has a pte_chain list then it is shared by at least two + * processes, or by a process which has recently done a fork+exec, + * because a single sharing uses PageDirect. + * - The pageout code collapses pte_chains with a single user back into + * PageDirect pointers. This is done lazily so a process can do a number + * of fork+exec sequences without having to allocate and free pte_chains. + * - A pte_chain list has free space only in the head member - all succeeding + * members are 100% full. + * - If the head element has free space, it occurs in its leading slots. + * - All free space in the pte_chain is at the start of the head member. + * - Insertion into the pte_chain puts a pte pointer in the last free slot + * of the head member. + * - Removal from a pte chain moves the head pte of the head member onto the + * victim pte and frees the head member if it became empty. + */ + +/** + * pte_chain_alloc - allocate a pte_chain struct + * @gfp_flags: allocation flags + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. + * Caller needs to hold the page's pte_chain_lock. + */ +struct pte_chain * pte_chain_alloc(int gfp_flags) +{ + struct pte_chain * pte_chain; + + pte_chain = kmem_cache_alloc(pte_chain_cache, gfp_flags); +#ifdef DEBUG_RMAP + { + int i; + for (i = 0; i < NRPTE; i++) + BUG_ON(pte_chain->ptes[i]); + BUG_ON(pte_chain->next); + } +#endif + return pte_chain; +} + +/** + * __pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + */ +void __pte_chain_free(struct pte_chain *pte_chain) +{ + pte_chain->next = NULL; + kmem_cache_free(pte_chain_cache, pte_chain); +} + +/** + ** VM stuff below this comment + **/ + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * @rsslimit: place to put whether the page is over RSS limit + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * In addition to this it checks if the processes holding the + * page are over or under their RSS limit. + * Caller needs to hold the pte_chain_lock. + * + * If the page has a single-entry pte_chain, collapse that back to a + * PageDirect representation. This way, it's only done under memory + * pressure, giving a slight speedup to fork+exec for active forkers. + */ +int page_referenced(struct page * page, int * rsslimit) +{ + int referenced = 0, under_rsslimit = 0; + struct mm_struct * mm; + struct pte_chain * pc; + + if (PageTestandClearReferenced(page)) + referenced++; + + if (PageDirect(page)) { + pte_t *pte = rmap_ptep_map(page->pte.direct); + if (ptep_test_and_clear_young(pte)) + referenced++; + + mm = ptep_to_mm(pte); + if (mm->rss < mm->rlimit_rss) + under_rsslimit++; + rmap_ptep_unmap(pte); + } else { + int nr_chains = 0; + + /* Check all the page tables mapping this page. 
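NRPTE above sizes each pte_chain link so that the whole struct, one next pointer plus the array of pte slots, fills exactly one L1 cache line. The small program below simply evaluates that expression for a couple of assumed configurations; the 32- and 64-bit figures are examples, not a statement about any particular architecture.

#include <stdio.h>

static unsigned long nrpte(unsigned long cache_line,
                           unsigned long ptr_size,
                           unsigned long pte_addr_size)
{
        return (cache_line - ptr_size) / pte_addr_size;
}

int main(void)
{
        /* e.g. 32-bit pointers, 32-bit pte_addr_t, 32-byte cache line */
        printf("NRPTE = %lu\n", nrpte(32, 4, 4));       /* 7 slots  */
        /* e.g. 64-bit pointers, 64-bit pte_addr_t, 128-byte cache line */
        printf("NRPTE = %lu\n", nrpte(128, 8, 8));      /* 15 slots */
        return 0;
}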
*/ + for (pc = page->pte.chain; pc; pc = pc->next) { + int i; + + for (i = NRPTE-1; i >= 0; i--) { + pte_addr_t pte_paddr = pc->ptes[i]; + pte_t *pte; + + if (!pte_paddr) + break; + pte = rmap_ptep_map(pte_paddr); + if (ptep_test_and_clear_young(pte)) + referenced++; + mm = ptep_to_mm(pte); + if (mm->rss < mm->rlimit_rss) + under_rsslimit++; + rmap_ptep_unmap(pte); + nr_chains++; + } + } + if (nr_chains == 1) { + pc = page->pte.chain; + page->pte.direct = pc->ptes[NRPTE-1]; + SetPageDirect(page); + pc->ptes[NRPTE-1] = 0; + __pte_chain_free(pc); + } + } + + /* + * We're only over the RSS limit if all the processes sharing the + * page are. + */ + *rsslimit = !under_rsslimit; + + return referenced; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. + */ +struct pte_chain * +page_add_rmap(struct page * page, pte_t * ptep, struct pte_chain * pte_chain) +{ + pte_addr_t pte_paddr = ptep_to_paddr(ptep); + struct pte_chain * cur_pte_chain; + int i; + +#ifdef DEBUG_RMAP + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!ptep_to_mm(ptep)) + BUG(); +#endif + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return pte_chain; + + pte_chain_lock(page); + +#ifdef DEBUG_RMAP + /* + * This stuff needs help to get up to highmem speed. + */ + { + struct pte_chain * pc; + if (PageDirect(page)) { + if (page->pte.direct == pte_paddr) + BUG(); + } else { + for (pc = page->pte.chain; pc; pc = pc->next) { + for (i = 0; i < NRPTE; i++) { + pte_addr_t pte = pc->ptes[i]; + + if (pte && pte == pte_paddr) + BUG(); + } + } + } + } +#endif + + if (page->pte.direct == 0) { + page->pte.direct = pte_paddr; + SetPageDirect(page); + goto out; + } + + if (PageDirect(page)) { + /* Convert a direct pointer into a pte_chain */ + ClearPageDirect(page); + pte_chain->ptes[NRPTE-1] = page->pte.direct; + pte_chain->ptes[NRPTE-2] = pte_paddr; + page->pte.direct = 0; + page->pte.chain = pte_chain; + pte_chain = NULL; /* We consumed it */ + goto out; + } + + cur_pte_chain = page->pte.chain; + if (cur_pte_chain->ptes[0]) { /* It's full */ + pte_chain->next = cur_pte_chain; + page->pte.chain = pte_chain; + pte_chain->ptes[NRPTE-1] = pte_paddr; + pte_chain = NULL; /* We consumed it */ + goto out; + } + + BUG_ON(!cur_pte_chain->ptes[NRPTE-1]); + + for (i = NRPTE-2; i >= 0; i--) { + if (!cur_pte_chain->ptes[i]) { + cur_pte_chain->ptes[i] = pte_paddr; + goto out; + } + } + BUG(); +out: + pte_chain_unlock(page); + return pte_chain; +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + * Caller needs to hold the mm->page_table_lock. + */ +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + pte_addr_t pte_paddr = ptep_to_paddr(ptep); + struct pte_chain *pc; + + if (!page || !ptep) + BUG(); + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + if (!page_mapped(page)) + return; /* remap_page_range() from a driver? 
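page_add_rmap() above follows a fixed insert policy: a page with a single mapping keeps a "direct" pte pointer, a second mapping converts that into a pte_chain whose head link is filled from the last slot towards slot 0, and a full head gets a fresh link pushed in front of it. The userspace model below mirrors that policy; toy_page, toy_add_rmap, NRPTE=4 and the plain unsigned long "pte addresses" are inventions for the sketch, and error handling is omitted.

#include <stdio.h>
#include <stdlib.h>

#define NRPTE 4

struct pte_chain {
        struct pte_chain *next;
        unsigned long ptes[NRPTE];
};

struct toy_page {
        int direct_flag;                /* models PageDirect()     */
        unsigned long direct;           /* models page->pte.direct */
        struct pte_chain *chain;        /* models page->pte.chain  */
};

static void toy_add_rmap(struct toy_page *page, unsigned long pte)
{
        struct pte_chain *head;
        int i;

        if (!page->direct_flag && !page->chain) {       /* first mapping */
                page->direct = pte;
                page->direct_flag = 1;
                return;
        }
        if (page->direct_flag) {                        /* second mapping */
                head = calloc(1, sizeof(*head));
                head->ptes[NRPTE - 1] = page->direct;
                head->ptes[NRPTE - 2] = pte;
                page->direct_flag = 0;
                page->direct = 0;
                page->chain = head;
                return;
        }
        head = page->chain;
        if (head->ptes[0]) {                            /* head link is full */
                struct pte_chain *pc = calloc(1, sizeof(*pc));
                pc->next = head;
                pc->ptes[NRPTE - 1] = pte;
                page->chain = pc;
                return;
        }
        for (i = NRPTE - 2; i >= 0; i--)                /* fill towards slot 0 */
                if (!head->ptes[i]) {
                        head->ptes[i] = pte;
                        return;
                }
}

int main(void)
{
        struct toy_page page = { 0, 0, NULL };
        unsigned long pte;
        struct pte_chain *pc;

        for (pte = 0x1000; pte <= 0x6000; pte += 0x1000)
                toy_add_rmap(&page, pte);

        for (pc = page.chain; pc; pc = pc->next) {
                int i;
                for (i = 0; i < NRPTE; i++)
                        printf("%lx ", pc->ptes[i]);
                printf("\n");
        }
        return 0;
}

Filling from the back keeps all the free slots at the front of the head link, which is what the "all free space is at the start of the head member" invariant in the comment block above relies on.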
*/ + + pte_chain_lock(page); + + if (PageDirect(page)) { + if (page->pte.direct == pte_paddr) { + page->pte.direct = 0; + ClearPageDirect(page); + goto out; + } + } else { + struct pte_chain *start = page->pte.chain; + int victim_i = -1; + + for (pc = start; pc; pc = pc->next) { + int i; + + if (pc->next) + prefetch(pc->next); + for (i = 0; i < NRPTE; i++) { + pte_addr_t pa = pc->ptes[i]; + + if (!pa) + continue; + if (victim_i == -1) + victim_i = i; + if (pa != pte_paddr) + continue; + pc->ptes[i] = start->ptes[victim_i]; + start->ptes[victim_i] = 0; + if (victim_i == NRPTE-1) { + /* Emptied a pte_chain */ + page->pte.chain = start->next; + __pte_chain_free(start); + } + goto out; + } + } + } +#ifdef DEBUG_RMAP + /* Not found. This should NEVER happen! */ + printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep); + printk(KERN_ERR "page_remove_rmap: only found: "); + if (PageDirect(page)) { + printk("%llx", (u64)page->pte.direct); + } else { + for (pc = page->pte.chain; pc; pc = pc->next) { + int i; + for (i = 0; i < NRPTE; i++) + printk(" %d:%llx", i, (u64)pc->ptes[i]); + } + } + printk("\n"); + printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n"); +#endif + +out: + pte_chain_unlock(page); + return; +} + +/** + * try_to_unmap_one - worker function for try_to_unmap + * @page: page to unmap + * @ptep: page table entry to unmap from page + * + * Internal helper function for try_to_unmap, called for each page + * table entry mapping a page. Because locking order here is opposite + * to the locking order used by the page fault path, we use trylocks. + * Locking: + * lru lock page_launder() + * page lock page_launder(), trylock + * pte_chain_lock page_launder() + * mm->page_table_lock try_to_unmap_one(), trylock + */ +static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); +static int try_to_unmap_one(struct page * page, pte_addr_t paddr) +{ + pte_t *ptep = rmap_ptep_map(paddr); + unsigned long address = ptep_to_address(ptep); + struct mm_struct * mm = ptep_to_mm(ptep); + struct vm_area_struct * vma; + pte_t pte; + int ret; + + if (!mm) + BUG(); + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) { + rmap_ptep_unmap(ptep); + return SWAP_AGAIN; + } + + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* Nuke the page table entry. */ + pte = ptep_get_and_clear(ptep); + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + + /* Store the swap location in the pte. See handle_pte_fault() ... */ + if (PageSwapCache(page)) { + swp_entry_t entry = { .val = page->index }; + swap_duplicate(entry); + set_pte(ptep, swp_entry_to_pte(entry)); + } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pte)) + set_page_dirty(page); + + mm->rss--; + page_cache_release(page); + ret = SWAP_SUCCESS; + +out_unlock: + rmap_ptep_unmap(ptep); + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the zone lru lock + * and the page lock. 
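page_remove_rmap() above removes an entry from the packed pte_chain by copying the head link's first used slot (the "victim") over it and clearing the victim slot; if the victim was the head's last slot the head link is now empty and gets unlinked and freed. The companion sketch below models just that, with the same invented types and NRPTE=4 as the previous sketch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NRPTE 4

struct pte_chain {
        struct pte_chain *next;
        unsigned long ptes[NRPTE];      /* free slots (zeros) only at the front of the head */
};

static void toy_remove_rmap(struct pte_chain **chainp, unsigned long pte)
{
        struct pte_chain *start = *chainp, *pc;
        int victim = -1;

        for (pc = start; pc; pc = pc->next) {
                int i;
                for (i = 0; i < NRPTE; i++) {
                        if (!pc->ptes[i])
                                continue;
                        if (victim == -1)
                                victim = i;     /* first used slot of the head */
                        if (pc->ptes[i] != pte)
                                continue;
                        pc->ptes[i] = start->ptes[victim];
                        start->ptes[victim] = 0;
                        if (victim == NRPTE - 1) {      /* head link emptied */
                                *chainp = start->next;
                                free(start);
                        }
                        return;
                }
        }
}

int main(void)
{
        unsigned long full[NRPTE] = { 0x1000, 0x2000, 0x3000, 0x4000 };
        struct pte_chain *tail = calloc(1, sizeof(*tail));
        struct pte_chain *head = calloc(1, sizeof(*head));
        struct pte_chain *pc;

        memcpy(tail->ptes, full, sizeof(full));
        head->next = tail;
        head->ptes[NRPTE - 1] = 0x5000;         /* head holds a single entry */

        toy_remove_rmap(&head, 0x2000);         /* 0x5000 moves into its slot */
        toy_remove_rmap(&head, 0x1000);         /* leaves a hole at the front */

        for (pc = head; pc; pc = pc->next)
                printf("%lx %lx %lx %lx\n",
                       pc->ptes[0], pc->ptes[1], pc->ptes[2], pc->ptes[3]);
        return 0;
}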
Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_ERROR - an error occurred + */ +int try_to_unmap(struct page * page) +{ + struct pte_chain *pc, *next_pc, *start; + int ret = SWAP_SUCCESS; + int victim_i = -1; + + /* This page should not be on the pageout lists. */ + if (PageReserved(page)) + BUG(); + if (!PageLocked(page)) + BUG(); + /* We need backing store to swap out a page. */ + if (!page->mapping) + BUG(); + + if (PageDirect(page)) { + ret = try_to_unmap_one(page, page->pte.direct); + if (ret == SWAP_SUCCESS) { + page->pte.direct = 0; + ClearPageDirect(page); + } + goto out; + } + + start = page->pte.chain; + for (pc = start; pc; pc = next_pc) { + int i; + + next_pc = pc->next; + if (next_pc) + prefetch(next_pc); + for (i = 0; i < NRPTE; i++) { + pte_addr_t pte_paddr = pc->ptes[i]; + + if (!pte_paddr) + continue; + if (victim_i == -1) + victim_i = i; + + switch (try_to_unmap_one(page, pte_paddr)) { + case SWAP_SUCCESS: + /* + * Release a slot. If we're releasing the + * first pte in the first pte_chain then + * pc->ptes[i] and start->ptes[victim_i] both + * refer to the same thing. It works out. + */ + pc->ptes[i] = start->ptes[victim_i]; + start->ptes[victim_i] = 0; + victim_i++; + if (victim_i == NRPTE) { + page->pte.chain = start->next; + __pte_chain_free(start); + start = page->pte.chain; + victim_i = 0; + } + break; + case SWAP_AGAIN: + /* Skip this pte, remembering status. */ + ret = SWAP_AGAIN; + continue; + case SWAP_FAIL: + ret = SWAP_FAIL; + goto out; + case SWAP_ERROR: + ret = SWAP_ERROR; + goto out; + } + } + } +out: + return ret; +} + +/** + ** No more VM stuff below this comment, only pte_chain helper + ** functions. + **/ + +static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) +{ + struct pte_chain *pc = p; + + memset(pc, 0, sizeof(*pc)); +} + +void __init pte_chain_init(void) +{ + pte_chain_cache = kmem_cache_create( "pte_chain", + sizeof(struct pte_chain), + 0, + 0, + pte_chain_ctor, + NULL); + + if (!pte_chain_cache) + panic("failed to create pte_chain cache!\n"); +} diff -Nru a/mm/swap.c b/mm/swap.c --- a/mm/swap.c Thu Apr 17 15:25:14 2003 +++ b/mm/swap.c Thu Apr 17 15:25:14 2003 @@ -15,10 +15,10 @@ #include #include -#include #include #include #include +#include #include #include /* for copy_to/from_user */ @@ -33,22 +33,148 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list. 
+ * (some pages aren't on any list at all) + */ + ClearPageReferenced(page); + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_inactive_dirty_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void deactivate_page(struct page * page) +{ + lru_lock(page_zone(page)); + deactivate_page_nolock(page); + lru_unlock(page_zone(page)); +} + +/** + * drop_page - like deactivate_page, but try inactive_clean list + * @page: the page to drop + * + * Try to move a page to the inactive_clean list, this succeeds if the + * page is clean and not in use by anybody. If the page cannot be placed + * on the inactive_clean list it is placed on the inactive_dirty list + * instead. + * + * Note: this function gets called with the lru lock held. + */ +void drop_page_zone(struct zone_struct *zone, struct page * page) +{ + if (!TryLockPage(page)) { + if (page->mapping && page->buffers) { + page_cache_get(page); + lru_unlock(zone); + try_to_release_page(page, GFP_NOIO); + page_cache_release(page); + lru_lock(zone); + } + UnlockPage(page); + } + + /* Make sure the page really is reclaimable. */ + pte_chain_lock(page); + if (!page->mapping || PageDirty(page) || page->pte.direct || + page->buffers || page_count(page) > 1) + deactivate_page_nolock(page); + + else if (page_count(page) == 1) { + ClearPageReferenced(page); + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_clean_list(page); + } + } + pte_chain_unlock(page); +} + +void drop_page(struct page * page) +{ + if (!TryLockPage(page)) { + if (page->mapping && page->buffers) { + page_cache_get(page); + lru_unlock(ALL_ZONES); + try_to_release_page(page, GFP_NOIO); + page_cache_release(page); + lru_lock(ALL_ZONES); + } + UnlockPage(page); + } + + /* Make sure the page really is reclaimable. */ + pte_chain_lock(page); + if (!page->mapping || PageDirty(page) || page->pte.direct || + page->buffers || page_count(page) > 1) + deactivate_page_nolock(page); + + else if (page_count(page) == 1) { + ClearPageReferenced(page); + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_clean_list(page); + } + } + pte_chain_unlock(page); +} + /* * Move an inactive page to the active list. 
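The new VM splits the old active/inactive pair into five per-zone lists: active_anon, active_cache, inactive_dirty, inactive_laundry and inactive_clean. The enum state machine below is only a mental model of the transitions that deactivate_page(), drop_page() and activate_page() perform, under the assumption that a reactivated page goes back to the anonymous active list; the real code of course also moves pages between the actual list heads and keeps per-list counters.

#include <stdio.h>

enum lru_state {
        ACTIVE_ANON, ACTIVE_CACHE,
        INACTIVE_DIRTY, INACTIVE_LAUNDRY, INACTIVE_CLEAN,
};

static const char *lru_name[] = {
        "active_anon", "active_cache",
        "inactive_dirty", "inactive_laundry", "inactive_clean",
};

/* deactivate_page(): active pages move to inactive_dirty. */
static enum lru_state deactivate(enum lru_state s)
{
        if (s == ACTIVE_ANON || s == ACTIVE_CACHE)
                return INACTIVE_DIRTY;
        return s;                       /* inactive pages are left alone */
}

/* drop_page(): immediately reclaimable pages go straight to
 * inactive_clean, everything else falls back to deactivation. */
static enum lru_state drop(enum lru_state s, int reclaimable)
{
        if (reclaimable && s != INACTIVE_CLEAN)
                return INACTIVE_CLEAN;
        return deactivate(s);
}

/* activate_page(): any inactive list goes back to an active list
 * (anon vs. cache is decided elsewhere; assume anon here). */
static enum lru_state activate(enum lru_state s)
{
        if (s == INACTIVE_DIRTY || s == INACTIVE_LAUNDRY || s == INACTIVE_CLEAN)
                return ACTIVE_ANON;
        return s;
}

int main(void)
{
        enum lru_state s = ACTIVE_CACHE;

        s = deactivate(s);  printf("-> %s\n", lru_name[s]);
        s = drop(s, 1);     printf("-> %s\n", lru_name[s]);
        s = activate(s);    printf("-> %s\n", lru_name[s]);
        return 0;
}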
*/ -static inline void activate_page_nolock(struct page * page) +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); - add_page_to_active_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page, INITIAL_AGE); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_active_list(page, INITIAL_AGE); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); + add_page_to_active_list(page, INITIAL_AGE); } } void activate_page(struct page * page) { - spin_lock(&pagemap_lru_lock); + lru_lock(page_zone(page)); activate_page_nolock(page); - spin_unlock(&pagemap_lru_lock); + lru_unlock(page_zone(page)); } /** @@ -58,10 +184,10 @@ void lru_cache_add(struct page * page) { if (!PageLRU(page)) { - spin_lock(&pagemap_lru_lock); - if (!TestSetPageLRU(page)) - add_page_to_inactive_list(page); - spin_unlock(&pagemap_lru_lock); + lru_lock(page_zone(page)); + if (!TestandSetPageLRU(page)) + add_page_to_active_list(page, INITIAL_AGE); + lru_unlock(page_zone(page)); } } @@ -70,17 +196,22 @@ * @page: the page to add * * This function is for when the caller already holds - * the pagemap_lru_lock. + * the lru lock. */ void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } + ClearPageLRU(page); } /** @@ -89,9 +220,9 @@ */ void lru_cache_del(struct page * page) { - spin_lock(&pagemap_lru_lock); + lru_lock(page_zone(page)); __lru_cache_del(page); - spin_unlock(&pagemap_lru_lock); + lru_unlock(page_zone(page)); } /* diff -Nru a/mm/swap_state.c b/mm/swap_state.c --- a/mm/swap_state.c Thu Apr 17 15:25:14 2003 +++ b/mm/swap_state.c Thu Apr 17 15:25:14 2003 @@ -89,6 +89,40 @@ return 0; } +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + if (add_to_swap_cache(page, entry) == 0) { + SetPageUptodate(page); + set_page_dirty(page); + swap_free(entry); + return 1; + } + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + } +} + /* * This must be called only on pages that have * been verified to be in the swap cache. diff -Nru a/mm/swapfile.c b/mm/swapfile.c --- a/mm/swapfile.c Thu Apr 17 15:25:14 2003 +++ b/mm/swapfile.c Thu Apr 17 15:25:14 2003 @@ -362,8 +362,9 @@ * what to do if a write is requested later. 
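add_to_swap() above loops because a "speculative" read_swap_cache_async() can race and install the same swap entry first; in that case the entry is released and a fresh one is tried. Below is a shape-only sketch of that retry pattern, with fake helpers (get_slot, try_install) standing in for get_swap_page() and add_to_swap_cache().

#include <stdio.h>

static int slots = 3;                   /* pretend swap device capacity */
static int race_once = 1;               /* first attempt loses a race   */

static int get_slot(void)
{
        return slots > 0 ? slots-- : 0; /* 0 means "no swap space left" */
}

static int try_install(int slot)
{
        if (race_once) {                /* somebody else grabbed this slot */
                race_once = 0;
                return -1;
        }
        printf("installed page at swap slot %d\n", slot);
        return 0;
}

int main(void)
{
        for (;;) {
                int slot = get_slot();

                if (!slot)
                        return 1;       /* no swap space left: fail */
                if (try_install(slot) == 0)
                        return 0;       /* success */
                slots++;                /* raced: put the slot back, like swap_free() */
        }
}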
*/ /* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, - pte_t *dir, swp_entry_t entry, struct page* page) +static void +unuse_pte(struct vm_area_struct * vma, unsigned long address, pte_t * dir, + swp_entry_t entry, struct page * page, struct pte_chain ** pte_chainp) { pte_t pte = *dir; @@ -373,6 +374,7 @@ return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); ++vma->vm_mm->rss; } @@ -382,7 +384,8 @@ unsigned long address, unsigned long size, unsigned long offset, swp_entry_t entry, struct page* page) { - pte_t * pte; + struct pte_chain * pte_chain = NULL; + pte_t *pte, *mapping; unsigned long end; if (pmd_none(*dir)) @@ -392,17 +395,25 @@ pmd_clear(dir); return; } - pte = pte_offset(dir, address); + mapping = pte = pte_offset_map(dir, address); offset += address & PMD_MASK; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; do { - unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); + /* + * FIXME: handle pte_chain_alloc() failures + */ + if (pte_chain == NULL) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + unuse_pte(vma, offset+address-vma->vm_start, + pte, entry, page, &pte_chain); address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_unmap(mapping); + pte_chain_free(pte_chain); } /* mmlist_lock and vma->vm_mm->page_table_lock are held */ diff -Nru a/mm/vmalloc.c b/mm/vmalloc.c --- a/mm/vmalloc.c Thu Apr 17 15:25:14 2003 +++ b/mm/vmalloc.c Thu Apr 17 15:25:14 2003 @@ -31,7 +31,7 @@ pmd_clear(pmd); return; } - pte = pte_offset(pmd, address); + pte = pte_offset_kernel(pmd, address); address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -126,7 +126,7 @@ if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; if (alloc_area_pte(pte, address, end - address, gfp_mask, prot)) diff -Nru a/mm/vmscan.c b/mm/vmscan.c --- a/mm/vmscan.c Thu Apr 17 15:25:14 2003 +++ b/mm/vmscan.c Thu Apr 17 15:25:14 2003 @@ -12,6 +12,7 @@ * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). * Multiqueue VM started 5.8.00, Rik van Riel. + * O(1) rmap vm, Arjan van de ven */ #include @@ -23,9 +24,12 @@ #include #include #include +#include #include +static void refill_freelist(void); +static void wakeup_memwaiters(void); /* * The "priority" of VM scanning is how much of the queues we * will scan in one go. A value of 6 for DEF_PRIORITY implies @@ -34,674 +38,926 @@ */ #define DEF_PRIORITY (6) -/* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, - * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. - */ - -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +static inline void age_page_up_nolock(struct page *page, int old_age) { - pte_t pte; - swp_entry_t entry; + int new_age; + + new_age = old_age+4; + if (new_age < 0) + new_age = 0; + if (new_age > MAX_AGE) + new_age = MAX_AGE; + + if (PageActiveAnon(page)) { + del_page_from_active_anon_list(page); + add_page_to_active_anon_list(page, new_age); + } else if (PageActiveCache(page)) { + del_page_from_active_cache_list(page); + add_page_to_active_cache_list(page, new_age); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page, new_age); + } else if (PageInactiveLaundry(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_active_list(page, new_age); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); + add_page_to_active_list(page, new_age); + } else return; - /* Don't look at this pte if it's been accessed recently. */ - if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return 0; - } +} - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; - /* Don't bother replenishing zones not under pressure.. */ - if (!memclass(page_zone(page), classzone)) - return 0; +/* Must be called with page's pte_chain_lock held. */ +static inline int page_mapping_inuse(struct page * page) +{ + struct address_space * mapping = page->mapping; + + /* Page is in somebody's page tables. */ + if (page->pte.direct) + return 1; - if (TryLockPage(page)) + /* XXX: does this happen ? */ + if (!mapping) return 0; - /* From this point on, the odds are that we're going to - * nuke this pte, so read and clear the pte. This hook - * is needed on CPUs which update the accessed and dirty - * bits in hardware. - */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(page_table); - flush_tlb_page(vma, address); - - if (pte_dirty(pte)) - set_page_dirty(page); - - /* - * Is the page already in the swap cache? If so, then - * we can just drop our reference to it without doing - * any IO - it's already up-to-date on disk. - */ - if (PageSwapCache(page)) { - entry.val = page->index; - swap_duplicate(entry); -set_swap_pte: - set_pte(page_table, swp_entry_to_pte(entry)); -drop_pte: - mm->rss--; - UnlockPage(page); - { - int freeable = page_count(page) - !!page->buffers <= 2; - page_cache_release(page); - return freeable; - } - } + /* File is mmaped by somebody. */ + if (mapping->i_mmap || mapping->i_mmap_shared) + return 1; - /* - * Is it a clean page? Then it must be recoverable - * by just paging it in again, and we can just drop - * it.. or if it's dirty but has backing store, - * just mark the page dirty and drop it. - * - * However, this won't actually free any real - * memory, as the page will just be in the page cache - * somewhere, and as such we should just continue - * our scan. - * - * Basically, this just makes it possible for us to do - * some real work in the future in "refill_inactive()". - */ - if (page->mapping) - goto drop_pte; - if (!PageDirty(page)) - goto drop_pte; + return 0; +} - /* - * Anonymous buffercache pages can be left behind by - * concurrent truncate and pagefault. 
- */ - if (page->buffers) - goto preserve; +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. + */ +struct page * reclaim_page(zone_t * zone) +{ + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. + * We need to hold the pagecache_lock around all tests to make sure + * reclaim_page() doesn't race with other pagecache users */ - for (;;) { - entry = get_swap_page(); - if (!entry.val) - break; - /* Add it to the swap cache and mark it dirty - * (adding to the page cache will clear the dirty - * and uptodate bits, so we need to do it again) - */ - if (add_to_swap_cache(page, entry) == 0) { - SetPageUptodate(page); - set_page_dirty(page); - goto set_swap_pte; + lru_lock(zone); + spin_lock(&pagecache_lock); + maxscan = zone->inactive_clean_pages; + while (maxscan-- && !list_empty(&zone->inactive_clean_list)) { + page_lru = zone->inactive_clean_list.prev; + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageInactiveClean(page))); + + /* Page is being freed */ + if (unlikely(page_count(page)) == 0) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + continue; } - /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); - } - - /* No swap space left */ -preserve: - set_pte(page_table, pte); - UnlockPage(page); - return 0; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pte_t * pte; - unsigned long pmd_end; - if (pmd_none(*dir)) - return count; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return count; - } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; + /* Page cannot be reclaimed ? Move to inactive_dirty list. */ + pte_chain_lock(page); + if (unlikely(page->pte.direct || page->buffers || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TryLockPage(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + pte_chain_unlock(page); + continue; + } - do { - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); + /* + * From here until reaching either the bottom of the loop + * or found_page: the pte_chain_lock is held. + */ - if (VALID_PAGE(page) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page, classzone); - if (!count) { - address += PAGE_SIZE; - break; - } - } + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - mm->swap_address = address; - return count; -} -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pmd_t * pmd; - unsigned long pgd_end; + if (page->mapping) { + __remove_inode_page(page); + goto found_page; + } - if (pgd_none(*dir)) - return count; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return count; + /* We should never ever get here. */ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + pte_chain_unlock(page); + UnlockPage(page); } + spin_unlock(&pagecache_lock); + lru_unlock(zone); + return NULL; - pmd = pmd_offset(dir, address); - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); - if (!count) - break; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return count; +found_page: + __lru_cache_del(page); + pte_chain_unlock(page); + spin_unlock(&pagecache_lock); + lru_unlock(zone); + if (entry.val) + swap_free(entry); + UnlockPage(page); + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; } -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) +/** + * need_rebalance_dirty - do we need to write inactive stuff to disk? + * @zone: the zone in question + * + * Returns true if the zone in question has an inbalance between inactive + * dirty on one side and inactive laundry + inactive clean on the other + * Right now set the balance at 50%; may need tuning later on + */ +static inline int need_rebalance_dirty(zone_t * zone) { - pgd_t *pgdir; - unsigned long end; - - /* Don't swap out areas which are reserved */ - if (vma->vm_flags & VM_RESERVED) - return count; + if (zone->inactive_dirty_pages > zone->inactive_laundry_pages + zone->inactive_clean_pages) + return 1; - pgdir = pgd_offset(mm, address); - - end = vma->vm_end; - BUG_ON(address >= end); - do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); - if (!count) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return count; + return 0; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; +/** + * need_rebalance_laundry - does the zone have too few inactive_clean pages? + * @zone: the zone in question + * + * Returns true if the zone in question has too few pages in inactive clean + * + free + */ +static inline int need_rebalance_laundry(zone_t * zone) +{ + if (free_low(zone) >= 0) + return 1; + return 0; +} -/* - * Returns remaining count of pages to be swapped out by followup call. +/** + * launder_page - clean dirty page, move to inactive_laundry list + * @zone: zone to free pages in + * @gfp_mask: what operations we are allowed to do + * @page: the page at hand, must be on the inactive dirty list + * + * per-zone lru lock is assumed to be held, but this function can drop + * it and sleep, so no other locks are allowed to be held. 
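The need_rebalance_dirty()/need_rebalance_laundry() helpers above drive the flow of pages through the three inactive stages: start laundering once dirty pages outnumber laundry plus clean pages, and push laundered pages on towards the clean list when clean plus free pages run short. The struct and the shortage test below are simplified stand-ins; in particular the real laundry test uses free_low(), which is only approximated here by comparing against pages_low.

#include <stdio.h>

struct toy_zone {
        long inactive_dirty, inactive_laundry, inactive_clean;
        long free_pages, pages_low;
};

static int need_rebalance_dirty(const struct toy_zone *z)
{
        return z->inactive_dirty > z->inactive_laundry + z->inactive_clean;
}

static int need_rebalance_laundry(const struct toy_zone *z)
{
        /* assumption: "short on memory" modelled as free+clean below pages_low */
        return z->free_pages + z->inactive_clean < z->pages_low;
}

int main(void)
{
        struct toy_zone z = { 400, 100, 50, 30, 128 };

        printf("start laundering dirty pages: %d\n", need_rebalance_dirty(&z));
        printf("push laundry towards clean:   %d\n", need_rebalance_laundry(&z));
        return 0;
}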
+ * + * returns 0 for failure; 1 for success */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) +int launder_page(zone_t * zone, int gfp_mask, struct page *page) { - unsigned long address; - struct vm_area_struct* vma; + int over_rsslimit; + + /* + * Page is being freed, don't worry about it, but report progress. + */ + if (unlikely(page_count(page)) == 0) + return 1; + BUG_ON(!PageInactiveDirty(page)); + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_laundry_list(page); + /* store the time we start IO */ + page->age = (jiffies/HZ)&255; /* - * Find the proper vm-area after freezing the vma chain - * and ptes. + * The page is locked. IO in progress? + * If so, move to laundry and report progress + * Acquire PG_locked early in order to safely + * access page->mapping. */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - if (address == TASK_SIZE || swap_mm != mm) { - /* We raced: don't count this mm but try again */ - ++*mmcounter; - goto out_unlock; + if (unlikely(TryLockPage(page))) { + return 1; } - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - for (;;) { - count = swap_out_vma(mm, vma, address, count, classzone); - vma = vma->vm_next; - if (!vma) - break; - if (!count) - goto out_unlock; - address = vma->vm_start; + /* + * The page is in active use or really unfreeable. Move to + * the active list and adjust the page age if needed. + */ + pte_chain_lock(page); + if (page_referenced(page, &over_rsslimit) && !over_rsslimit && + page_mapping_inuse(page)) { + del_page_from_inactive_laundry_list(page); + add_page_to_active_list(page, INITIAL_AGE); + pte_chain_unlock(page); + UnlockPage(page); + return 1; + } + + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page->pte.direct && !page->mapping && !page->buffers) { + page_cache_get(page); + pte_chain_unlock(page); + lru_unlock(zone); + if (!add_to_swap(page)) { + activate_page(page); + UnlockPage(page); + page_cache_release(page); + lru_lock(zone); + return 0; } + page_cache_release(page); + lru_lock(zone); + /* Note: may be on another list ! */ + if (!PageInactiveLaundry(page)) { + UnlockPage(page); + return 1; + } + if (unlikely(page_count(page)) == 0) { + UnlockPage(page); + return 1; + } + pte_chain_lock(page); } - /* Indicate that we reached the end of address space */ - mm->swap_address = TASK_SIZE; -out_unlock: - spin_unlock(&mm->page_table_lock); - return count; -} + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. 
+ */ + if (page->pte.direct && page->mapping) { + switch (try_to_unmap(page)) { + case SWAP_ERROR: + case SWAP_FAIL: + goto page_active; + case SWAP_AGAIN: + pte_chain_unlock(page); + UnlockPage(page); + lru_unlock(zone); + cpu_relax(); + lru_lock(zone); + return 0; + case SWAP_SUCCESS: + ; /* fall through, try freeing the page below */ + /* fixme: add a SWAP_MLOCK case */ + } + } + pte_chain_unlock(page); -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) -{ - int counter, nr_pages = SWAP_CLUSTER_MAX; - struct mm_struct *mm; + if (PageDirty(page) && page->mapping) { + /* + * The page can be dirtied after we start writing, but + * in that case the dirty bit will simply be set again + * and we'll need to write it again. + */ + int (*writepage)(struct page *); - counter = mmlist_nr; - do { - if (unlikely(current->need_resched)) { - __set_current_state(TASK_RUNNING); - schedule(); - } + writepage = page->mapping->a_ops->writepage; + if ((gfp_mask & __GFP_FS) && writepage) { + ClearPageDirty(page); + SetPageLaunder(page); + page_cache_get(page); + lru_unlock(zone); + + writepage(page); - spin_lock(&mmlist_lock); - mm = swap_mm; - while (mm->swap_address == TASK_SIZE || mm == &init_mm) { - mm->swap_address = 0; - mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == swap_mm) - goto empty; - swap_mm = mm; + page_cache_release(page); + lru_lock(zone); + return 1; + } else { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_dirty_list(page); + /* FIXME: this is wrong for !__GFP_FS !!! */ + UnlockPage(page); + return 0; } + } - /* Make sure the mm doesn't disappear when we drop the lock.. */ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + */ + if (page->buffers) { + /* To avoid freeing our page before we're done. */ + page_cache_get(page); + lru_unlock(zone); - nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); + try_to_release_page(page, gfp_mask); + UnlockPage(page); - mmput(mm); + /* + * If the buffers were the last user of the page we free + * the page here. Because of that we shouldn't hold the + * lru lock yet. + */ + page_cache_release(page); - if (!nr_pages) - return 1; - } while (--counter >= 0); + lru_lock(zone); + return 1; + } + /* + * If the page is really freeable now, move it to the + * inactive_laundry list to keep LRU order. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. + * This test is not safe from races; only the one in + * reclaim_page() needs to be. + */ + pte_chain_lock(page); + if (page->mapping && !PageDirty(page) && !page->pte.direct && + page_count(page) == 1) { + pte_chain_unlock(page); + UnlockPage(page); + return 1; + } else { + /* + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it + * back to the active list. + */ + page_active: + activate_page_nolock(page); + pte_chain_unlock(page); + UnlockPage(page); + } return 0; +} -empty: - spin_unlock(&mmlist_lock); - return 0; +/* + * The aging interval varies from fast to really slow, it is + * important that we never age too fast and desirable that we + * keep the pages sorted in order for eviction. 
+ * + * Note that while most of the time kscand's recalculating of + * the per zone aging interval should be good enough, we want + * the ability to do "emergency wakeups" here since memory zones + * can suddenly come under VM pressure. + */ +#define MAX_AGING_INTERVAL ((unsigned long)300*HZ) +#define MIN_AGING_INTERVAL ((unsigned long)HZ/2) +static void speedup_aging(struct zone_struct * zone) +{ + zone->need_scan++; + if (zone->need_scan > 3) { + unsigned long next_wakeup = jiffies + MIN_AGING_INTERVAL; + if (time_before(next_wakeup, zone->age_next)) + zone->age_next = next_wakeup; + } } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +/* Ages down all pages on the active list */ +/* assumes the lru lock held */ +static inline void kachunk_anon(struct zone_struct * zone) { - struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10); - - spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { - struct page * page; - - if (unlikely(current->need_resched)) { - spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); - schedule(); - spin_lock(&pagemap_lru_lock); - continue; - } + int k; + if (!list_empty(&zone->active_anon_list[0])) + return; + if (!zone->active_anon_pages) + return; + + for (k = 0; k < MAX_AGE; k++) { + list_splice_init(&zone->active_anon_list[k+1], &zone->active_anon_list[k]); + zone->active_anon_count[k] = zone->active_anon_count[k+1]; + zone->active_anon_count[k+1] = 0; + } - page = list_entry(entry, struct page, lru); + zone->anon_age_bias++; + speedup_aging(zone); +} - BUG_ON(!PageLRU(page)); - BUG_ON(PageActive(page)); +static inline void kachunk_cache(struct zone_struct * zone) +{ + int k; + if (!list_empty(&zone->active_cache_list[0])) + return; + if (!zone->active_cache_pages) + return; + + for (k = 0; k < MAX_AGE; k++) { + list_splice_init(&zone->active_cache_list[k+1], &zone->active_cache_list[k]); + zone->active_cache_count[k] = zone->active_cache_count[k+1]; + zone->active_cache_count[k+1] = 0; + } - list_del(entry); - list_add(entry, &inactive_list); + zone->cache_age_bias++; + speedup_aging(zone); +} - /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. - */ - if (unlikely(!page_count(page))) - continue; +#define BATCH_WORK_AMOUNT 64 - if (!memclass(page_zone(page), classzone)) - continue; +/* + * returns the active cache ratio relative to the total active list + * times 100 (eg. 30% cache returns 30) + */ +static inline int cache_ratio(struct zone_struct * zone) +{ + if (!zone->size) + return 0; + return 100 * zone->active_cache_pages / (zone->active_cache_pages + + zone->active_anon_pages + 1); +} + +struct cache_limits cache_limits = { + .min = 1, + .borrow = 15, + .max = 100, +}; + +/** + * refill_inactive_zone - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan + * + * This function will scan a portion of the active list of a zone to find + * unused pages, those pages will then be moved to the inactive list. 
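A compact userspace sketch of the bucket rotation that kachunk_anon()/kachunk_cache() perform: when age bucket 0 runs dry, every bucket slides down one slot and a bias counter records the shift. Only per-bucket counts are modelled here; the real code splices the list heads as well, and the MAX_AGE value below is just a placeholder:

#include <stdio.h>

#define MAX_AGE 8

static unsigned long count[MAX_AGE + 1] = { 0, 0, 3, 0, 5, 0, 0, 2, 1 };
static unsigned long age_bias;

static void kachunk(void)
{
	int k;

	if (count[0])                  /* bucket 0 still has pages */
		return;
	for (k = 0; k < MAX_AGE; k++)
		count[k] = count[k + 1];
	count[MAX_AGE] = 0;
	age_bias++;                    /* logical ages shift, pages don't move */
}

int main(void)
{
	kachunk();
	kachunk();
	printf("bucket0=%lu bias=%lu\n", count[0], age_bias);
	return 0;
}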
+ */ +int refill_inactive_zone(struct zone_struct * zone, int priority, int target) +{ + int maxscan = (zone->active_anon_pages + zone->active_cache_pages) >> priority; + struct list_head * page_lru; + struct page * page; + int over_rsslimit; + int progress = 0; + int reclaim_anon = 0; + int reclaim_cache = 1; + + /* Take the lock while messing with the list... */ + lru_lock(zone); + if (target < BATCH_WORK_AMOUNT) + target = BATCH_WORK_AMOUNT; + + if (cache_ratio(zone) < cache_limits.borrow) + reclaim_anon = 1; + if (cache_ratio(zone) < cache_limits.min) + reclaim_cache = 0; + /* Could happen if the sysadmin sets borrow below min... */ + if (!reclaim_anon && !reclaim_cache) + reclaim_cache = reclaim_anon = 1; + + while (maxscan-- && zone->active_anon_pages + zone->active_cache_pages > 0 && target > 0) { + int anon_work = 0, cache_work = 0; + if (reclaim_anon) + anon_work = BATCH_WORK_AMOUNT; + if (reclaim_cache) + cache_work = BATCH_WORK_AMOUNT; + + while (--anon_work >= 0 && zone->active_anon_pages) { + if (list_empty(&zone->active_anon_list[0])) { + kachunk_anon(zone); + continue; + } - /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) - goto page_mapped; + page_lru = zone->active_anon_list[0].prev; + page = list_entry(page_lru, struct page, lru); - /* - * The page is locked. IO in progress? - * Move it to the back of the list. - */ - if (unlikely(TryLockPage(page))) { - if (PageLaunder(page) && (gfp_mask & __GFP_FS)) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - wait_on_page(page); - page_cache_release(page); - spin_lock(&pagemap_lru_lock); + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageActiveAnon(page))); + + /* Needed to follow page->mapping */ + if (TryLockPage(page)) { + /* The page is already locked. This for sure means + * someone is doing stuff with it which makes it + * active by definition ;) + */ + del_page_from_active_anon_list(page); + add_page_to_active_anon_list(page, INITIAL_AGE); + continue; } - continue; - } - if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { /* - * It is not critical here to write it only if - * the page is unmapped beause any direct writer - * like O_DIRECT would set the PG_dirty bitflag - * on the phisical page after having successfully - * pinned it and after the I/O to the page is finished, - * so the direct writes to the page cannot get lost. + * Do aging on the pages. */ - int (*writepage)(struct page *); - - writepage = page->mapping->a_ops->writepage; - if ((gfp_mask & __GFP_FS) && writepage) { - ClearPageDirty(page); - SetPageLaunder(page); - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - - writepage(page); - page_cache_release(page); - - spin_lock(&pagemap_lru_lock); + pte_chain_lock(page); + if (page_referenced(page, &over_rsslimit) && !over_rsslimit) { + pte_chain_unlock(page); + age_page_up_nolock(page, 0); + UnlockPage(page); continue; } + pte_chain_unlock(page); + + deactivate_page_nolock(page); + target--; + progress++; + UnlockPage(page); } - /* - * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we try to free - * the page as well. 
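The reclaim_anon/reclaim_cache decision at the top of refill_inactive_zone() can be read as a small policy function of cache_ratio() against cache_limits. A sketch with invented page counts, using the default limits from the hunk above:

#include <stdio.h>

struct limits { int min, borrow, max; };
static const struct limits cache_limits = { .min = 1, .borrow = 15, .max = 100 };

static int cache_ratio(unsigned long cache_pages, unsigned long anon_pages)
{
	/* percentage of the active list that is page cache (+1 avoids /0) */
	return 100 * cache_pages / (cache_pages + anon_pages + 1);
}

int main(void)
{
	unsigned long cache = 200, anon = 4000;
	int ratio = cache_ratio(cache, anon);
	int reclaim_anon  = ratio < cache_limits.borrow;
	int reclaim_cache = ratio >= cache_limits.min;

	/* Guard against a misconfigured borrow < min: reclaim both. */
	if (!reclaim_anon && !reclaim_cache)
		reclaim_anon = reclaim_cache = 1;

	printf("ratio=%d%% reclaim_anon=%d reclaim_cache=%d\n",
	       ratio, reclaim_anon, reclaim_cache);
	return 0;
}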
- */ - if (page->buffers) { - spin_unlock(&pagemap_lru_lock); + while (--cache_work >= 0 && zone->active_cache_pages) { + if (list_empty(&zone->active_cache_list[0])) { + kachunk_cache(zone); + continue; + } - /* avoid to free a locked page */ - page_cache_get(page); + page_lru = zone->active_cache_list[0].prev; + page = list_entry(page_lru, struct page, lru); - if (try_to_release_page(page, gfp_mask)) { - if (!page->mapping) { - /* - * We must not allow an anon page - * with no buffers to be visible on - * the LRU, so we unlock the page after - * taking the lru lock - */ - spin_lock(&pagemap_lru_lock); - UnlockPage(page); - __lru_cache_del(page); - - /* effectively free the page here */ - page_cache_release(page); - - if (--nr_pages) - continue; - break; - } else { - /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. - */ - page_cache_release(page); + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageActiveCache(page))); + + /* Needed to follow page->mapping */ + if (TryLockPage(page)) { + /* The page is already locked. This for sure means + * someone is doing stuff with it which makes it + * active by definition ;) + */ + del_page_from_active_cache_list(page); + add_page_to_active_cache_list(page, INITIAL_AGE); + continue; + } - spin_lock(&pagemap_lru_lock); - } - } else { - /* failed to drop the buffers so stop here */ + /* + * Do aging on the pages. + */ + pte_chain_lock(page); + if (page_referenced(page, &over_rsslimit) && !over_rsslimit) { + pte_chain_unlock(page); + age_page_up_nolock(page, 0); UnlockPage(page); - page_cache_release(page); - - spin_lock(&pagemap_lru_lock); continue; } + pte_chain_unlock(page); + + deactivate_page_nolock(page); + target--; + progress++; + UnlockPage(page); } + } + lru_unlock(zone); + + return progress; +} - spin_lock(&pagecache_lock); +static int need_active_anon_scan(struct zone_struct * zone) +{ + int low = 0, high = 0; + int k; + for (k=0; k < MAX_AGE/2; k++) + low += zone->active_anon_count[k]; - /* - * this is the non-racy check for busy page. - */ - if (!page->mapping || !is_page_cache_freeable(page)) { - spin_unlock(&pagecache_lock); - UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; + for (k=MAX_AGE/2; k <= MAX_AGE; k++) + high += zone->active_anon_count[k]; + + if (highactive_cache_count[k]; + + for (k=MAX_AGE/2; k <= MAX_AGE; k++) + high += zone->active_cache_count[k]; + + if (highinactive_laundry_list)) { + page_lru = zone->inactive_laundry_list.prev; + page = list_entry(page_lru, struct page, lru); + /* Wrong page on list?! (list corruption, should not happen) */ + BUG_ON(unlikely(!PageInactiveLaundry(page))); + + /* TryLock to see if the page IO is done */ + if (TryLockPage(page)) { /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! + * Page is locked (IO in progress?). If we can sleep, + * wait for it to finish, except when we've already + * done enough work. 
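The need_active_anon_scan()/need_active_cache_scan() helpers sum the young and old halves of the age buckets; the comparison itself is truncated in the hunk above, so the sketch below assumes a scan is requested when the old half holds fewer pages than the young half:

#include <stdio.h>

#define MAX_AGE 8

static int need_scan(const unsigned long count[MAX_AGE + 1])
{
	unsigned long low = 0, high = 0;
	int k;

	for (k = 0; k < MAX_AGE / 2; k++)
		low += count[k];
	for (k = MAX_AGE / 2; k <= MAX_AGE; k++)
		high += count[k];

	return high < low;   /* assumed: age pages up while most are still young */
}

int main(void)
{
	unsigned long count[MAX_AGE + 1] = { 40, 30, 20, 10, 5, 2, 1, 0, 0 };

	printf("scan needed: %s\n", need_scan(count) ? "yes" : "no");
	return 0;
}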
*/ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; + if ((gfp_mask & __GFP_WAIT) && (work_done < max_work)) { + int timed_out; + + page_cache_get(page); + lru_unlock(zone); + run_task_queue(&tq_disk); + timed_out = wait_on_page_timeout(page, 5 * HZ); + page_cache_release(page); + lru_lock(zone); + /* + * If we timed out and the page has been in + * flight for over 30 seconds, this might not + * be the best page to wait on; move it to + * the head of the dirty list. + */ + if (timed_out & PageInactiveLaundry(page)) { + unsigned char now; + now = (jiffies/HZ)&255; + if (now - page->age > 30) { + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_dirty_list(page); + } + continue; + } + /* We didn't make any progress for our caller, + * but we are actively avoiding a livelock + * so undo the decrement and wait on this page + * some more, until IO finishes or we timeout. + */ + max_loop++; + continue; + } else + /* No dice, we can't wait for IO */ + break; } + UnlockPage(page); /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. + * If we get here either the IO on the page is done or + * IO never happened because it was clean. Either way + * move it to the inactive clean list. */ - if (PageDirty(page)) { - spin_unlock(&pagecache_lock); - UnlockPage(page); - continue; - } - - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - spin_unlock(&pagecache_lock); - } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); - swap_free(swap); - } - __lru_cache_del(page); - UnlockPage(page); + /* FIXME: check if the page is still clean or is accessed ? */ - /* effectively free the page here */ - page_cache_release(page); + del_page_from_inactive_laundry_list(page); + add_page_to_inactive_clean_list(page); + work_done++; - if (--nr_pages) - continue; - break; + /* + * If we've done the minimal batch of work and there's + * no longer a need to rebalance, abort now. + */ + if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_laundry(zone))) + break; } - spin_unlock(&pagemap_lru_lock); - return nr_pages; + lru_unlock(zone); + return work_done; } /* - * This moves pages from the active list to - * the inactive list. - * - * We move them the other way when we see the - * reference bit on the page. + * Move max_work pages from the dirty list as long as there is a need. + * Start IO if the gfp_mask allows it. */ -static void refill_inactive(int nr_pages) +int rebalance_dirty_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask) { - struct list_head * entry; - - spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages && entry != &active_list) { - struct page * page; - - page = list_entry(entry, struct page, lru); - entry = entry->prev; - if (PageTestandClearReferenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); + struct list_head * page_lru; + int max_loop; + int work_done = 0; + struct page * page; + + max_loop = max_work; + if (max_loop < BATCH_WORK_AMOUNT) + max_loop = BATCH_WORK_AMOUNT; + /* Take the lock while messing with the list... */ + lru_lock(zone); + while (max_loop-- && !list_empty(&zone->inactive_dirty_list)) { + page_lru = zone->inactive_dirty_list.prev; + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! 
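launder_page() stamps page->age with (jiffies/HZ)&255 when IO starts, and the laundry scan above treats a page as stuck after more than 30 of those seconds. The sketch casts the difference back to a byte so the comparison survives the 256-second wrap; the helper name is invented:

#include <stdio.h>

static int io_stuck(unsigned char stamped, unsigned long now_seconds)
{
	unsigned char now = now_seconds & 255;   /* same 8-bit clock as page->age */

	return (unsigned char)(now - stamped) > 30;
}

int main(void)
{
	/* Stamped at second 250, checked at second 260 (4 ticks after the wrap). */
	printf("10s in flight: %s\n", io_stuck(250 & 255, 260) ? "stuck" : "ok");
	/* Stamped at second 250, checked at second 300. */
	printf("50s in flight: %s\n", io_stuck(250 & 255, 300) ? "stuck" : "ok");
	return 0;
}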
(list corruption, should not happen) */ + BUG_ON(unlikely(!PageInactiveDirty(page))); + + /* + * Note: launder_page() sleeps so we can't safely look at + * the page after this point! + * + * If we fail (only happens if we can't do IO) we just try + * again on another page; launder_page makes sure we won't + * see the same page over and over again. + */ + if (!launder_page(zone, gfp_mask, page)) continue; - } - nr_pages--; + work_done++; - del_page_from_active_list(page); - add_page_to_inactive_list(page); - SetPageReferenced(page); + /* + * If we've done the minimal batch of work and there's + * no longer any need to rebalance, abort now. + */ + if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_dirty(zone))) + break; } - spin_unlock(&pagemap_lru_lock); + lru_unlock(zone); + + return work_done; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/* goal percentage sets the goal of the laundry+clean+free of the total zone size */ +int rebalance_inactive_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask, int goal_percentage) { - int chunk_size = nr_pages; - unsigned long ratio; + int ret = 0; + /* first deactivate memory */ + if (((zone->inactive_laundry_pages + zone->inactive_clean_pages + zone->free_pages)*100 < zone->size * goal_percentage) && + (inactive_high(zone) > 0)) + refill_inactive_zone(zone, 0, max_work + BATCH_WORK_AMOUNT); + + if (need_rebalance_dirty(zone)) + ret += rebalance_dirty_zone(zone, max_work, gfp_mask); + if (need_rebalance_laundry(zone)) + ret += rebalance_laundry_zone(zone, max_work, gfp_mask); - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; + /* These pages will become freeable, let the OOM detection know */ + ret += zone->inactive_laundry_pages; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + return ret; +} - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; +int rebalance_inactive(unsigned int gfp_mask, int percentage) +{ + struct zone_struct * zone; + int max_work; + int ret = 0; + + max_work = 4 * BATCH_WORK_AMOUNT; + /* If we're in deeper trouble, do more work */ + if (percentage >= 50) + max_work = 8 * BATCH_WORK_AMOUNT; + + for_each_zone(zone) + ret += rebalance_inactive_zone(zone, max_work, gfp_mask, percentage); + /* 4 * BATCH_WORK_AMOUNT needs tuning */ - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + return ret; +} + +/** + * background_aging - slow background aging of zones + * @priority: priority at which to scan + * + * When the VM load is low or nonexistant, this function is + * called once a second to "sort" the pages in the VM. This + * way we know which pages to evict once a load spike happens. + * The effects of this function are very slow, the CPU usage + * should be minimal to nonexistant under most loads. 
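The goal test at the top of rebalance_inactive_zone() only deactivates more pages while laundry + clean + free pages are below goal_percentage of the zone; direct reclaim passes 100 and the kswapd path passes 5. A toy version with invented numbers:

#include <stdio.h>

static int below_goal(unsigned long laundry, unsigned long clean,
		      unsigned long free_pages, unsigned long zone_size,
		      unsigned long goal_percentage)
{
	return (laundry + clean + free_pages) * 100 < zone_size * goal_percentage;
}

int main(void)
{
	/* 1000-page zone with 60 easy pages: fine for kswapd's 5% goal ... */
	printf("refill at 5%%:   %s\n", below_goal(20, 20, 20, 1000, 5) ? "yes" : "no");
	/* ... but direct reclaim's 100% goal still wants more deactivation. */
	printf("refill at 100%%: %s\n", below_goal(20, 20, 20, 1000, 100) ? "yes" : "no");
	return 0;
}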
+ */ +static inline void background_aging(int priority) +{ + struct zone_struct * zone; - return nr_pages; + for_each_zone(zone) + if (inactive_low(zone) > 0) + refill_inactive_zone(zone, priority, BATCH_WORK_AMOUNT); + for_each_zone(zone) + if (free_plenty(zone) > 0) + rebalance_dirty_zone(zone, BATCH_WORK_AMOUNT, GFP_KSWAPD); } -int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask) +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) { - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + int ret = 0; - gfp_mask = pf_gfp_mask(gfp_mask); - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. + */ + ret += rebalance_inactive(gfp_mask, 100); + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(1, gfp_mask); +#ifdef CONFIG_QUOTA + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); +#endif + + /* + * Reclaim unused slab cache memory. + */ + ret += kmem_cache_reap(gfp_mask); /* * Hmm.. Cache shrink failed - time to kill something? * Mhwahahhaha! This is the part I really like. Giggle. */ - out_of_memory(); - return 0; -} - -int try_to_free_pages(unsigned int gfp_mask) -{ - pg_data_t *pgdat; - zonelist_t *zonelist; - unsigned long pf_free_pages; - int error = 0; - - pf_free_pages = current->flags & PF_FREE_PAGES; - current->flags &= ~PF_FREE_PAGES; + if (!ret && free_low(ANY_ZONE) && (gfp_mask&__GFP_WAIT)) + out_of_memory(); - for_each_pgdat(pgdat) { - zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); - error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask); - } - - current->flags |= pf_free_pages; - return error; + return ret; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages_kswapd(unsigned int gfp_mask) { - zone_t * first_classzone; + int ret = 0; + struct zone_struct * zone; - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; -} + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(DEF_PRIORITY, gfp_mask); + ret += try_to_reclaim_buffers(DEF_PRIORITY, gfp_mask); +#ifdef CONFIG_QUOTA + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); +#endif -static int kswapd_balance_pgdat(pg_data_t * pgdat) -{ - int need_more_balance = 0, i; - zone_t * zone; + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. 
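A small model of the out-of-memory gate at the end of do_try_to_free_pages(): the OOM code is only reached when a full reclaim pass made no progress, some zone is genuinely short on free pages, and the caller was allowed to wait. GFP_WAIT here is a stand-in bit, not the kernel's definition:

#include <stdio.h>

#define GFP_WAIT 0x10   /* stand-in for __GFP_WAIT */

static int should_oom(int progress, int free_low_any_zone, unsigned gfp_mask)
{
	return !progress && free_low_any_zone && (gfp_mask & GFP_WAIT);
}

int main(void)
{
	printf("%d\n", should_oom(0, 1, GFP_WAIT));  /* 1: time to kill something */
	printf("%d\n", should_oom(3, 1, GFP_WAIT));  /* 0: reclaim made progress  */
	printf("%d\n", should_oom(0, 1, 0));         /* 0: atomic caller          */
	return 0;
}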
+ */ + rebalance_inactive(gfp_mask, 5); - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (unlikely(current->need_resched)) - schedule(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - continue; - } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; + for_each_zone(zone) { + int maxloop = zone->inactive_dirty_pages; + maxloop = maxloop / (16 * BATCH_WORK_AMOUNT) + 1; + while (need_rebalance_dirty(zone) && maxloop-- > 0) + rebalance_dirty_zone(zone, 16 * BATCH_WORK_AMOUNT, gfp_mask); } - return need_more_balance; -} + for_each_zone(zone) + if (free_high(zone)>0) + rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0); -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; + refill_freelist(); - do { - need_more_balance = 0; + /* Start IO when needed. */ + if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0) + run_task_queue(&tq_disk); - for_each_pgdat(pgdat) - need_more_balance |= kswapd_balance_pgdat(pgdat); - } while (need_more_balance); + return ret; } -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +/** + * refill_freelist - move inactive_clean pages to free list if needed + * + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. This + * function really only does something when we don't have a + * userspace load on __alloc_pages(). + * + * We refill the freelist in a bump from pages_min to pages_min * 2 + * in order to give the buddy allocator something to play with. + */ +static void refill_freelist(void) { + struct page * page; zone_t * zone; - int i; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) + for_each_zone(zone) { + if (!zone->size || zone->free_pages >= zone->pages_min) continue; - return 0; - } - - return 1; -} -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; - - for_each_pgdat(pgdat) { - if (!kswapd_can_sleep_pgdat(pgdat)) - return 0; + while (zone->free_pages < zone->pages_min * 2) { + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } } - - return 1; } /* @@ -720,7 +976,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -744,31 +999,229 @@ * Kswapd main loop. */ for (;;) { - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + static long recalc = 0; - mb(); - if (kswapd_can_sleep()) - schedule(); + /* + * We try to rebalance the VM either when we have a + * global shortage of free pages or when one particular + * zone is very short on free pages. + */ + if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0) + do_try_to_free_pages_kswapd(GFP_KSWAPD); + + refill_freelist(); + + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + + /* Do background page aging. */ + background_aging(DEF_PRIORITY); + } + + wakeup_memwaiters(); + } +} + +static int kswapd_overloaded; +unsigned int kswapd_minfree; /* initialized in mm/page_alloc.c */ +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +DECLARE_WAIT_QUEUE_HEAD(kswapd_done); + +/** + * wakeup_kswapd - wake up the pageout daemon + * gfp_mask: page freeing flags + * + * This function wakes up kswapd and can, under heavy VM pressure, + * put the calling task to sleep temporarily. 
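refill_freelist() tops a zone back up from its inactive_clean pool once it drops below pages_min, stopping at pages_min * 2 so atomic allocations have headroom. A toy zone makes the bump visible; the struct is invented for the sketch:

#include <stdio.h>

struct toy_zone { unsigned long free_pages, pages_min, inactive_clean; };

static void refill_freelist(struct toy_zone *z)
{
	if (z->free_pages >= z->pages_min)
		return;                         /* zone is fine, leave it alone */
	while (z->free_pages < z->pages_min * 2 && z->inactive_clean) {
		z->inactive_clean--;            /* reclaim_page()               */
		z->free_pages++;                /* __free_page()                */
	}
}

int main(void)
{
	struct toy_zone z = { .free_pages = 10, .pages_min = 32, .inactive_clean = 500 };

	refill_freelist(&z);
	printf("free=%lu clean=%lu\n", z.free_pages, z.inactive_clean);
	return 0;
}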
+ */ +void wakeup_kswapd(unsigned int gfp_mask) +{ + DECLARE_WAITQUEUE(wait, current); + + /* If we're in the memory freeing business ourself, don't sleep + * but just wake kswapd and go back to businesss. + */ + if (current->flags & PF_MEMALLOC) { + wake_up_interruptible(&kswapd_wait); + return; + } - __set_current_state(TASK_RUNNING); + /* We need all of kswapd's GFP flags, otherwise we can't sleep on it. + * We still wake kswapd of course. + */ + if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) { + wake_up_interruptible(&kswapd_wait); + return; + } + + add_wait_queue(&kswapd_done, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Wake kswapd .... */ + wake_up_interruptible(&kswapd_wait); + + /* ... and check if we need to wait on it */ + if ((free_low(ALL_ZONES) > (kswapd_minfree / 2)) && !kswapd_overloaded) + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); +} + +static void wakeup_memwaiters(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&kswapd_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + /* Don't let the processes waiting on memory get stuck, ever. */ + wake_up(&kswapd_done); + + /* Enough free RAM, we can easily keep up with memory demand. */ + if (free_high(ALL_ZONES) <= 0) { + schedule_timeout(HZ); remove_wait_queue(&kswapd_wait, &wait); + return; + } + remove_wait_queue(&kswapd_wait, &wait); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. - */ - kswapd_balance(); - run_task_queue(&tq_disk); + /* OK, the VM is very loaded. Sleep instead of using all CPU. */ + kswapd_overloaded = 1; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 40); + kswapd_overloaded = 0; + return; +} + +/** + * try_to_free_pages - run the pageout code ourselves + * gfp_mask: mask of things the pageout code is allowed to do + * + * When the load on the system gets higher, it can happen + * that kswapd no longer manages to keep enough memory + * free. In those cases user programs allocating memory + * will call try_to_free_pages() and help the pageout code. + * This has the effects of freeing memory and slowing down + * the largest memory hogs a bit. + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + + gfp_mask = pf_gfp_mask(gfp_mask); + if (gfp_mask & __GFP_WAIT) { + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + + return ret; +} + +/** + * rss_free_pages - run part of the pageout code and slow down a bit + * @gfp_mask: mask of things the pageout code is allowed to do + * + * This function is called when a task is over its RSS limit and + * has a page fault. It's goal is to free some memory so non-hogs + * can run faster and slow down itself when needed so it won't eat + * the memory non-hogs can use. + */ +void rss_free_pages(unsigned int gfp_mask) +{ + long pause = 0; + struct zone_struct * zone; + + if (current->flags & PF_MEMALLOC) + return; + + current->flags |= PF_MEMALLOC; + + do { + rebalance_inactive(gfp_mask, 100); + for_each_zone(zone) + if (free_plenty(zone) >= 0) + rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(pause); + set_current_state(TASK_RUNNING); + pause++; + } while (free_high(ALL_ZONES) >= 0); + + current->flags &= ~PF_MEMALLOC; + return; } +/* + * The background page scanning daemon, started as a kernel thread + * from the init process. 
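The throttling idea in rss_free_pages() above is that each trip around the reclaim loop sleeps one jiffy longer than the last, so an over-limit task slows itself in proportion to how long the shortage lasts. The sketch fakes the sleep with a counter and an assumed number of passes:

#include <stdio.h>

int main(void)
{
	long pause = 0;          /* jiffies to sleep this round            */
	long slept = 0;
	int rounds_needed = 5;   /* pretend pressure clears after 5 passes */

	do {
		/* rebalance_inactive() / rebalance_laundry_zone() go here */
		slept += pause;  /* schedule_timeout(pause)                */
		pause++;         /* back off a little more next time       */
	} while (--rounds_needed > 0);

	printf("total ticks slept: %ld over %ld rounds\n", slept, pause);
	return 0;
}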
+ * + * This is the part that background scans the active list to find + * pages that are referenced and increases their age score. + * It is important that this scan rate is not proportional to vm pressure + * per se otherwise cpu usage becomes unbounded. On the other hand, if there's + * no VM pressure at all it shouldn't age stuff either otherwise everything + * ends up at the maximum age. + */ +int kscand(void *unused) +{ + struct task_struct *tsk = current; + struct zone_struct * zone; + unsigned long iv; + int age; + + daemonize(); + strcpy(tsk->comm, "kscand"); + sigfillset(&tsk->blocked); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(MIN_AGING_INTERVAL); + for_each_zone(zone) { + if (time_before(jiffies, zone->age_next)) + continue; + + if (need_active_anon_scan(zone)) { + for (age = 0; age < MAX_AGE; age++) { + scan_active_list(zone, age, + &zone->active_anon_list[age]); + if (current->need_resched) + schedule(); + } + } + + if (need_active_cache_scan(zone)) { + for (age = 0; age < MAX_AGE; age++) { + scan_active_list(zone, age, + &zone->active_cache_list[age]); + if (current->need_resched) + schedule(); + } + } + + iv = zone->age_interval; + /* Check if we've been aging quickly enough ... */ + if (zone->need_scan >= 2) + iv = max(iv / 2, MIN_AGING_INTERVAL); + /* ... or too quickly. */ + else if (!zone->need_scan) + iv = max(iv + (iv / 2), MAX_AGING_INTERVAL); + zone->need_scan = 0; + zone->age_interval = iv; + zone->age_next = jiffies + iv; + } + } +} + + static int __init kswapd_init(void) { printk("Starting kswapd\n"); swap_setup(); kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + kernel_thread(kscand, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); return 0; }
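kscand retunes each zone's aging interval: halved (down to MIN_AGING_INTERVAL) when the zone asked for scans twice or more, grown when no scan was needed. The hunk above grows with max(iv + iv/2, MAX_AGING_INTERVAL), which jumps straight to the ceiling; the sketch below caps the growth with the smaller of the two instead, which is only one plausible reading of the intent:

#include <stdio.h>

#define HZ                  100UL
#define MIN_AGING_INTERVAL  (HZ / 2)
#define MAX_AGING_INTERVAL  (300UL * HZ)

static unsigned long retune(unsigned long iv, int need_scan)
{
	if (need_scan >= 2)          /* emergency wakeups happened: age faster */
		iv = iv / 2 > MIN_AGING_INTERVAL ? iv / 2 : MIN_AGING_INTERVAL;
	else if (!need_scan)         /* zone idle: back off                    */
		iv = iv + iv / 2 < MAX_AGING_INTERVAL ? iv + iv / 2 : MAX_AGING_INTERVAL;
	return iv;
}

int main(void)
{
	printf("under pressure: %lu ticks\n", retune(10 * HZ, 3));
	printf("idle zone:      %lu ticks\n", retune(10 * HZ, 0));
	return 0;
}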