# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	ChangeSet	1.388 -> 1.393
#	mm/oom_kill.c	1.9 -> 1.10
#	include/linux/mmzone.h	1.7 -> 1.9
#	fs/buffer.c	1.61 -> 1.62
#	include/linux/swap.h	1.31 -> 1.32
#	include/linux/elevator.h	1.4 -> 1.5
#	include/linux/mm.h	1.38 -> 1.39
#	mm/mmap.c	1.23 -> 1.24
#	drivers/s390/ccwcache.c	1.3 -> 1.4
#	mm/page_alloc.c	1.43 -> 1.44
#	arch/arm/mm/mm-armv.c	1.5 -> 1.6
#	include/linux/sched.h	1.29 -> 1.30
#	kernel/fork.c	1.23 -> 1.24
#	drivers/block/ll_rw_blk.c	1.34 -> 1.35
#	kernel/sysctl.c	1.16 -> 1.17
#	kernel/sys.c	1.9 -> 1.10
#	Makefile	1.161 -> 1.162
#	include/linux/swapctl.h	1.2 -> 1.3
#	fs/dcache.c	1.16 -> 1.18
#	fs/dquot.c	1.16 -> 1.18
#	mm/vmscan.c	1.59 -> 1.61
#	fs/proc/proc_misc.c	1.13 -> 1.14
#	mm/swapfile.c	1.23 -> 1.24
#	mm/slab.c	1.14 -> 1.16
#	drivers/block/elevator.c	1.5 -> 1.6
#	include/linux/fs.h	1.60 -> 1.61
#	mm/bootmem.c	1.6 -> 1.7
#	mm/filemap.c	1.62 -> 1.63
#	fs/exec.c	1.20 -> 1.21
#	mm/swap.c	1.16 -> 1.17
#	mm/swap_state.c	1.17 -> 1.18
#	mm/memory.c	1.50 -> 1.51
#	fs/inode.c	1.32 -> 1.34
#	include/linux/slab.h	1.8 -> 1.10
#	arch/i386/kernel/setup.c	1.37 -> 1.38
#	mm/mremap.c	1.5 -> 1.6
#	mm/Makefile	1.5 -> 1.6
#	(new)	-> 1.1	include/asm-alpha/rmap.h
#	(new)	-> 1.1	mm/rmap.c
#	(new)	-> 1.1	include/asm-sparc64/rmap.h
#	(new)	-> 1.1	include/asm-i386/rmap.h
#	(new)	-> 1.1	include/asm-generic/rmap.h
#	(new)	-> 1.1	include/asm-s390x/rmap.h
#	(new)	-> 1.1	include/asm-ppc/rmap.h
#	(new)	-> 1.1	include/asm-arm/rmap.h
#	(new)	-> 1.1	include/asm-sparc/rmap.h
#	(new)	-> 1.1	include/asm-mips/rmap.h
#	(new)	-> 1.2	include/linux/mm_inline.h
#	(new)	-> 1.1	include/asm-mips64/rmap.h
#	(new)	-> 1.1	include/asm-s390/rmap.h
#	(new)	-> 1.1	include/asm-parisc/rmap.h
#	(new)	-> 1.1	include/asm-sh/rmap.h
#	(new)	-> 1.1	include/asm-m68k/rmap.h
#	(new)	-> 1.1	include/asm-ia64/rmap.h
#	(new)	-> 1.2	Changelog.rmap
#	(new)	-> 1.1	mm/TODO
#	(new)	-> 1.1	include/asm-cris/rmap.h
#	(new)	-> 1.1	include/asm-arm/proc-armv/rmap.h
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/04/07	riel@mirkwood.rielhome.conectiva	1.220.8.1
# imported rmap 12h for 2.4.19-pre3 in one block, migrate everything
# to marcelo's bk tree
# --------------------------------------------
# 02/04/07	riel@mirkwood.rielhome.conectiva	1.383.1.1
# merged
# --------------------------------------------
# 02/04/08	riel@mirkwood.rielhome.conectiva	1.389
# Merge linuxvm@linuxvm.bkbits.net:linux-2.4-rmap
# into mirkwood.rielhome.conectiva:/bkbits/linux-2.4-rmap
# --------------------------------------------
# 02/04/09	riel@duckman.distro.conectiva	1.390
# remove compiler.h includes
# --------------------------------------------
# 02/04/09	hch@sb.bsdonline.org	1.389.1.1
# Remove kmem_cache_shrink_nr, the non-prefixed version now has a sane
# return value.
# --------------------------------------------
# 02/04/09	riel@duckman.distro.conectiva	1.391
# Merge
# --------------------------------------------
# 02/04/09	riel@duckman.distro.conectiva	1.392
# rmap 12i
# --------------------------------------------
# 02/04/09	riel@duckman.distro.conectiva	1.393
# remove extra tab from mmzone.h
# --------------------------------------------
#
diff -Nru a/Changelog.rmap b/Changelog.rmap
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/Changelog.rmap	Tue Apr 9 17:36:56 2002
@@ -0,0 +1,152 @@
+The ninth maintenance release of the 12th version of the reverse
+mapping based VM is now available.
+This is an attempt at making a more robust and flexible VM
+subsystem, while cleaning up a lot of code at the same time.
+The patch is available from:
+
+	http://surriel.com/patches/2.4/2.4.19p6-rmap-12i
+and	http://linuxvm.bkbits.net/
+
+
+My big TODO items for a next release are:
+ - O(1) page launder - currently functional but slow, needs to be tuned
+ - pte-highmem
+ - fine grained locking for SMP and NUMA		(William Lee Irwin)
+
+rmap 12i:
+ - slab cleanup						(Christoph Hellwig)
+ - remove references to compiler.h from mm/*		(me)
+ - move rmap to marcelo's bk tree			(me)
+ - minor cleanups					(me)
+rmap 12h:
+ - hopefully fix OOM detection algorithm		(me)
+ - drop pte quicklist in anticipation of pte-highmem	(me)
+ - replace andrea's highmem emulation by ingo's one	(me)
+ - improve rss limit checking				(Nick Piggin)
+rmap 12g:
+ - port to armv architecture				(David Woodhouse)
+ - NUMA fix to zone_table initialisation		(Samuel Ortiz)
+ - remove init_page_count				(David Miller)
+rmap 12f:
+ - for_each_pgdat macro					(William Lee Irwin)
+ - put back EXPORT(__find_get_page) for modular rd	(me)
+ - make bdflush and kswapd actually start queued disk IO (me)
+rmap 12e
+ - RSS limit fix, the limit can be 0 for some reason	(me)
+ - clean up for_each_zone define to not need pgdata_t	(William Lee Irwin)
+ - fix i810_dma bug introduced with page->wait removal	(William Lee Irwin)
+rmap 12d:
+ - fix compiler warning in rmap.c			(Roger Larsson)
+ - read latency improvement (read-latency2)		(Andrew Morton)
+rmap 12c:
+ - fix small balancing bug in page_launder_zone	(Nick Piggin)
+ - wakeup_kswapd / wakeup_memwaiters code fix		(Arjan van de Ven)
+ - improve RSS limit enforcement			(me)
+rmap 12b:
+ - highmem emulation (for debugging purposes)		(Andrea Arcangeli)
+ - ulimit RSS enforcement when memory gets tight	(me)
+ - sparc64 page->virtual quickfix			(Greg Procunier)
+rmap 12a:
+ - fix the compile warning in buffer.c			(me)
+ - fix divide-by-zero on highmem initialisation DOH!	(me)
+ - remove the pgd quicklist (suspicious ...)		(DaveM, me)
+rmap 12:
+ - keep some extra free memory on large machines	(Arjan van de Ven, me)
+ - higher-order allocation bugfix			(Adrian Drzewiecki)
+ - nr_free_buffer_pages() returns inactive + free mem	(me)
+ - pages from unused objects directly to inactive_clean (me)
+ - use fast pte quicklists on non-pae machines		(Andrea Arcangeli)
+ - remove sleep_on from wakeup_kswapd			(Arjan van de Ven)
+ - page waitqueue cleanup				(Christoph Hellwig)
+rmap 11c:
+ - oom_kill race locking fix				(Andres Salomon)
+ - elevator improvement					(Andrew Morton)
+ - dirty buffer writeout speedup (hopefully ;))		(me)
+ - small documentation updates				(me)
+ - page_launder() never does synchronous IO, kswapd
+   and the processes calling it sleep on higher level	(me)
+ - deadlock fix in touch_page()				(me)
+rmap 11b:
+ - added low latency reschedule points in vmscan.c	(me)
+ - make i810_dma.c include mm_inline.h too		(William Lee Irwin)
+ - wake up kswapd sleeper tasks on OOM kill so the
+   killed task can continue on its way out		(me)
+ - tune page allocation sleep point a little		(me)
+rmap 11a:
+ - don't let refill_inactive() progress count for OOM	(me)
+ - after an OOM kill, wait 5 seconds for the next kill	(me)
+ - agpgart_be fix for hashed waitqueues			(William Lee Irwin)
+rmap 11:
+ - fix stupid logic inversion bug in wakeup_kswapd()	(Andrew Morton)
+ - fix it again in the morning				(me)
+ - add #ifdef BROKEN_PPC_PTE_ALLOC_ONE to rmap.h, it
+   seems PPC calls pte_alloc() before mem_map[] init	(me)
+ - disable the debugging code in rmap.c ... the code
+   is working and people are running benchmarks		(me)
+ - let the slab cache shrink functions return a value
+   to help prevent early OOM killing			(Ed Tomlinson)
+ - also, don't call the OOM code if we have enough
+   free pages						(me)
+ - move the call to lru_cache_del into __free_pages_ok	(Ben LaHaise)
+ - replace the per-page waitqueue with a hashed
+   waitqueue, reduces size of struct page from 64
+   bytes to 52 bytes (48 bytes on non-highmem machines)	(William Lee Irwin)
+rmap 10:
+ - fix the livelock for real (yeah right), turned out
+   to be a stupid bug in page_launder_zone()		(me)
+ - to make sure the VM subsystem doesn't monopolise
+   the CPU, let kswapd and some apps sleep a bit under
+   heavy stress situations				(me)
+ - let __GFP_HIGH allocations dig a little bit deeper
+   into the free page pool, the SCSI layer seems fragile (me)
+rmap 9:
+ - improve comments all over the place			(Michael Cohen)
+ - don't panic if page_remove_rmap() cannot find the
+   rmap in question, it's possible that the memory was
+   PG_reserved and belonging to a driver, but the driver
+   exited and cleared the PG_reserved bit		(me)
+ - fix the VM livelock by replacing > by >= in a few
+   critical places in the pageout code			(me)
+ - treat the reclaiming of an inactive_clean page like
+   allocating a new page, calling try_to_free_pages()
+   and/or fixup_freespace() if required			(me)
+ - when low on memory, don't make things worse by
+   doing swapin_readahead				(me)
+rmap 8:
+ - add ANY_ZONE to the balancing functions to improve
+   kswapd's balancing a bit				(me)
+ - regularize some of the maximum loop bounds in
+   vmscan.c for cosmetic purposes			(William Lee Irwin)
+ - move page_address() to architecture-independent
+   code, now the removal of page->virtual is portable	(William Lee Irwin)
+ - speed up free_area_init_core() by doing a single
+   pass over the pages and not using atomic ops		(William Lee Irwin)
+ - documented the buddy allocator in page_alloc.c	(William Lee Irwin)
+rmap 7:
+ - clean up and document vmscan.c			(me)
+ -
reduce size of page struct, part one (William Lee Irwin) + - add rmap.h for other archs (untested, not for ARM) (me) +rmap 6: + - make the active and inactive_dirty list per zone, + this is finally possible because we can free pages + based on their physical address (William Lee Irwin) + - cleaned up William's code a bit (me) + - turn some defines into inlines and move those to + mm_inline.h (the includes are a mess ...) (me) + - improve the VM balancing a bit (me) + - add back inactive_target to /proc/meminfo (me) +rmap 5: + - fixed recursive buglet, introduced by directly + editing the patch for making rmap 4 ;))) (me) +rmap 4: + - look at the referenced bits in page tables (me) +rmap 3: + - forgot one FASTCALL definition (me) +rmap 2: + - teach try_to_unmap_one() about mremap() (me) + - don't assign swap space to pages with buffers (me) + - make the rmap.c functions FASTCALL / inline (me) +rmap 1: + - fix the swap leak in rmap 0 (Dave McCracken) +rmap 0: + - port of reverse mapping VM to 2.4.16 (me) diff -Nru a/Makefile b/Makefile --- a/Makefile Tue Apr 9 17:36:56 2002 +++ b/Makefile Tue Apr 9 17:36:56 2002 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 19 -EXTRAVERSION = -pre6 +EXTRAVERSION = -pre6-rmap12i KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nru a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c --- a/arch/arm/mm/mm-armv.c Tue Apr 9 17:36:56 2002 +++ b/arch/arm/mm/mm-armv.c Tue Apr 9 17:36:56 2002 @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -470,6 +471,7 @@ * cache implementation. */ kmem_cache_t *pte_cache; +kmem_cache_t *pte_rmap_cache; /* * The constructor gets called for each object within the cache when the @@ -480,6 +482,22 @@ { unsigned long block = (unsigned long)pte; + if (!(block & 2048)) { + /* First object of two in a page - allocate the + pte_rmap_info to go with them */ + + struct page * page = virt_to_page(pte); + + if (flags & SLAB_CTOR_ATOMIC) + BUG(); + + page->mapping = kmem_cache_alloc(pte_rmap_cache, GFP_KERNEL); + if (!page->mapping) { + printk(KERN_CRIT "pte_rmap_cache alloc failed. Oops. 
Slab constructors need to be allowed to fail\n"); + /* return -ENOMEM; */ + BUG(); + } + } if (block & 2047) BUG(); @@ -488,11 +506,32 @@ PTRS_PER_PTE * sizeof(pte_t), 0); } +static void pte_cache_dtor(void *pte, kmem_cache_t *cache, unsigned long flags) +{ + unsigned long block = (unsigned long)pte; + + if (!(block & 2048)) { + /* First object of two in a page - free the + pte_rmap_info that was associated with them */ + + struct page * page = virt_to_page(pte); + + kmem_cache_free(pte_rmap_cache, page->mapping); + page->mapping = NULL; + } +} + void __init pgtable_cache_init(void) { + pte_rmap_cache = kmem_cache_create("pte-rmap-cache", + 2 * sizeof(struct arm_rmap_info), 0, 0, + NULL, NULL); + if (!pte_rmap_cache) + BUG(); + pte_cache = kmem_cache_create("pte-cache", 2 * PTRS_PER_PTE * sizeof(pte_t), 0, 0, - pte_cache_ctor, NULL); + pte_cache_ctor, pte_cache_dtor); if (!pte_cache) BUG(); } diff -Nru a/drivers/block/elevator.c b/drivers/block/elevator.c --- a/drivers/block/elevator.c Tue Apr 9 17:36:56 2002 +++ b/drivers/block/elevator.c Tue Apr 9 17:36:56 2002 @@ -80,30 +80,38 @@ struct buffer_head *bh, int rw, int max_sectors) { - struct list_head *entry = &q->queue_head; - unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE; - + struct list_head *entry; + unsigned int count = bh->b_size >> 9; + unsigned int ret = ELEVATOR_NO_MERGE; + int merge_only = 0; + const int max_bomb_segments = q->elevator.max_bomb_segments; + + entry = &q->queue_head; while ((entry = entry->prev) != head) { struct request *__rq = blkdev_entry_to_request(entry); - /* - * simply "aging" of requests in queue - */ - if (__rq->elevator_sequence-- <= 0) - break; - + if (__rq->elevator_sequence-- <= 0) { + /* + * OK, we've exceeded someone's latency limit. + * But we still continue to look for merges, + * because they're so much better than seeks. + */ + merge_only = 1; + } if (__rq->waiting) continue; if (__rq->rq_dev != bh->b_rdev) continue; - if (!*req && bh_rq_in_between(bh, __rq, &q->queue_head)) + if (!*req && !merge_only && + bh_rq_in_between(bh, __rq, &q->queue_head)) { *req = __rq; + } if (__rq->cmd != rw) continue; if (__rq->nr_sectors + count > max_sectors) continue; if (__rq->elevator_sequence < count) - break; + merge_only = 1; if (__rq->sector + __rq->nr_sectors == bh->b_rsector) { ret = ELEVATOR_BACK_MERGE; *req = __rq; @@ -116,6 +124,56 @@ } } + /* + * If we failed to merge a read anywhere in the request + * queue, we really don't want to place it at the end + * of the list, behind lots of writes. So place it near + * the front. + * + * We don't want to place it in front of _all_ writes: that + * would create lots of seeking, and isn't tunable. + * We try to avoid promoting this read in front of existing + * reads. + * + * max_bomb_sectors becomes the maximum number of write + * requests which we allow to remain in place in front of + * a newly introduced read. We weight things a little bit, + * so large writes are more expensive than small ones, but it's + * requests which count, not sectors. + */ + if (max_bomb_segments && rw == READ && ret == ELEVATOR_NO_MERGE) { + int cur_latency = 0; + struct request * const cur_request = *req; + + entry = head->next; + while (entry != &q->queue_head) { + struct request *__rq; + + if (entry == &q->queue_head) + BUG(); + if (entry == q->queue_head.next && + q->head_active && !q->plugged) + BUG(); + __rq = blkdev_entry_to_request(entry); + + if (__rq == cur_request) { + /* + * This is where the old algorithm placed it. 
+ * There's no point pushing it further back, + * so leave it here, in sorted order. + */ + break; + } + if (__rq->cmd == WRITE) { + cur_latency += 1 + __rq->nr_sectors / 64; + if (cur_latency >= max_bomb_segments) { + *req = __rq; + break; + } + } + entry = entry->next; + } + } return ret; } @@ -188,7 +246,7 @@ output.queue_ID = elevator->queue_ID; output.read_latency = elevator->read_latency; output.write_latency = elevator->write_latency; - output.max_bomb_segments = 0; + output.max_bomb_segments = elevator->max_bomb_segments; if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t))) return -EFAULT; @@ -207,9 +265,12 @@ return -EINVAL; if (input.write_latency < 0) return -EINVAL; + if (input.max_bomb_segments < 0) + return -EINVAL; elevator->read_latency = input.read_latency; elevator->write_latency = input.write_latency; + elevator->max_bomb_segments = input.max_bomb_segments; return 0; } diff -Nru a/drivers/s390/ccwcache.c b/drivers/s390/ccwcache.c --- a/drivers/s390/ccwcache.c Tue Apr 9 17:36:56 2002 +++ b/drivers/s390/ccwcache.c Tue Apr 9 17:36:56 2002 @@ -291,9 +291,11 @@ /* Shrink the caches, if available */ for ( cachind = 0; cachind < CCW_NUMBER_CACHES; cachind ++ ) { if ( ccw_cache[cachind] ) { - if ( kmem_cache_shrink(ccw_cache[cachind]) == 0 ) { - ccw_cache[cachind] = NULL; - } + /* + * lessons learned today: + * 1) never ever call kmem_cache_destroy on a nul ptr. + * ... + */ kmem_cache_destroy(ccw_cache[cachind]); } } diff -Nru a/fs/buffer.c b/fs/buffer.c --- a/fs/buffer.c Tue Apr 9 17:36:56 2002 +++ b/fs/buffer.c Tue Apr 9 17:36:56 2002 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -727,11 +728,9 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); + try_to_free_pages(GFP_NOFS); run_task_queue(&tq_disk); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); @@ -2601,63 +2600,23 @@ return 1; } -/* - * The first time the VM inspects a page which has locked buffers, it - * will just mark it as needing waiting upon on the scan of the page LRU. - * BH_Wait_IO is used for this. - * - * The second time the VM visits the page, if it still has locked - * buffers, it is time to start writing them out. (BH_Wait_IO was set). - * - * The third time the VM visits the page, if the I/O hasn't completed - * then it's time to wait upon writeout. BH_Lock and BH_Launder are - * used for this. - * - * There is also the case of buffers which were locked by someone else - * - write(2) callers, bdflush, etc. There can be a huge number of these - * and we don't want to just skip them all and fail the page allocation. - * We want to be able to wait on these buffers as well. - * - * The BH_Launder bit is set in submit_bh() to indicate that I/O is - * underway against the buffer, doesn't matter who started it - we know - * that the buffer will eventually come unlocked, and so it's safe to - * wait on it. - * - * The caller holds the page lock and the caller will free this page - * into current->local_page, so by waiting on the page's buffers the - * caller is guaranteed to obtain this page. - * - * sync_page_buffers() will sort-of return true if all the buffers - * against this page are freeable, so try_to_free_buffers() should - * try to free the page's buffers a second time. This is a bit - * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly. 
- */ -static int sync_page_buffers(struct buffer_head *head) +static void sync_page_buffers(struct buffer_head *head) { struct buffer_head * bh = head; - int tryagain = 1; do { if (!buffer_dirty(bh) && !buffer_locked(bh)) continue; /* Don't start IO first time around.. */ - if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) { - tryagain = 0; + if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) continue; - } - /* Second time through we start actively writing out.. */ - if (test_and_set_bit(BH_Lock, &bh->b_state)) { - if (unlikely(!buffer_launder(bh))) { - tryagain = 0; - continue; - } - wait_on_buffer(bh); - tryagain = 1; + /* If we cannot lock the buffer just skip it. */ + if (test_and_set_bit(BH_Lock, &bh->b_state)) continue; - } + /* Second time through we start actively writing out.. */ if (!atomic_set_buffer_clean(bh)) { unlock_buffer(bh); continue; @@ -2667,10 +2626,9 @@ get_bh(bh); bh->b_end_io = end_buffer_io_sync; submit_bh(WRITE, bh); - tryagain = 0; } while ((bh = bh->b_this_page) != head); - return tryagain; + return; } /* @@ -2694,7 +2652,6 @@ { struct buffer_head * tmp, * bh = page->buffers; -cleaned_buffers_try_again: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); tmp = bh; @@ -2737,15 +2694,9 @@ write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); gfp_mask = pf_gfp_mask(gfp_mask); - if (gfp_mask & __GFP_IO) { - if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { - if (sync_page_buffers(bh)) { - /* no IO or waiting next time */ - gfp_mask = 0; - goto cleaned_buffers_try_again; - } - } - } + if ((gfp_mask & __GFP_IO) && + ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page))) + sync_page_buffers(bh); if (balance_dirty_state() >= 0) wakeup_bdflush(); return 0; @@ -3010,8 +2961,10 @@ break; ndirty -= NRSYNC; } - if (ndirty > 0 || bdflush_stop()) + if (ndirty > 0 || bdflush_stop()) { + run_task_queue(&tq_disk); interruptible_sleep_on(&bdflush_wait); + } } } diff -Nru a/fs/dcache.c b/fs/dcache.c --- a/fs/dcache.c Tue Apr 9 17:36:56 2002 +++ b/fs/dcache.c Tue Apr 9 17:36:56 2002 @@ -568,8 +568,7 @@ count = dentry_stat.nr_unused / priority; prune_dcache(count); - kmem_cache_shrink(dentry_cache); - return 0; + return kmem_cache_shrink(dentry_cache); } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) diff -Nru a/fs/dquot.c b/fs/dquot.c --- a/fs/dquot.c Tue Apr 9 17:36:56 2002 +++ b/fs/dquot.c Tue Apr 9 17:36:56 2002 @@ -413,8 +413,7 @@ lock_kernel(); prune_dqcache(nr_free_dquots / (priority + 1)); unlock_kernel(); - kmem_cache_shrink(dquot_cachep); - return 0; + return kmem_cache_shrink(dquot_cachep); } /* NOTE: If you change this function please check whether dqput_blocks() works right... 
*/ diff -Nru a/fs/exec.c b/fs/exec.c --- a/fs/exec.c Tue Apr 9 17:36:56 2002 +++ b/fs/exec.c Tue Apr 9 17:36:56 2002 @@ -35,6 +35,7 @@ #include #include #include +#include #define __NO_VERSION__ #include @@ -279,6 +280,7 @@ flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + page_add_rmap(page, pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); diff -Nru a/fs/inode.c b/fs/inode.c --- a/fs/inode.c Tue Apr 9 17:36:56 2002 +++ b/fs/inode.c Tue Apr 9 17:36:56 2002 @@ -725,8 +725,7 @@ count = inodes_stat.nr_unused / priority; prune_icache(count); - kmem_cache_shrink(inode_cachep); - return 0; + return kmem_cache_shrink(inode_cachep); } /* diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c Tue Apr 9 17:36:56 2002 +++ b/fs/proc/proc_misc.c Tue Apr 9 17:36:56 2002 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -166,7 +167,9 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inactive: %8u kB\n" + "Inact_dirty: %8u kB\n" + "Inact_clean: %8u kB\n" + "Inact_target: %8lu kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -180,7 +183,9 @@ K(pg_size - swapper_space.nrpages), K(swapper_space.nrpages), K(nr_active_pages), - K(nr_inactive_pages), + K(nr_inactive_dirty_pages), + K(nr_inactive_clean_pages), + K(inactive_target()), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -Nru a/include/asm-alpha/rmap.h b/include/asm-alpha/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-alpha/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _ALPHA_RMAP_H +#define _ALPHA_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-arm/proc-armv/rmap.h b/include/asm-arm/proc-armv/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-arm/proc-armv/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,72 @@ +#ifndef _ARMV_RMAP_H +#define _ARMV_RMAP_H +/* + * linux/include/asm-arm/proc-armv/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * + * We use the struct page of the page table page to find a pointer + * to an array of two 'struct arm_rmap_info's, one for each of the + * two page tables in each page. 
+ * + * - rmi->mm points to the process' mm_struct + * - rmi->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +struct arm_rmap_info { + struct mm_struct *mm; + unsigned long index; +}; + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = mm; + rmi->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = NULL; + rmi->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + return rmi->mm; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + unsigned long low_bits; + + if (((unsigned long)ptep)&2048) + rmi++; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return rmi->index + low_bits; +} + +#endif /* _ARMV_RMAP_H */ diff -Nru a/include/asm-arm/rmap.h b/include/asm-arm/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-arm/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,6 @@ +#ifndef _ARM_RMAP_H +#define _ARM_RMAP_H + +#include + +#endif /* _ARM_RMAP_H */ diff -Nru a/include/asm-cris/rmap.h b/include/asm-cris/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-cris/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _CRIS_RMAP_H +#define _CRIS_RMAP_H + +/* nothing to see, move along :) */ +#include + +#endif diff -Nru a/include/asm-generic/rmap.h b/include/asm-generic/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-generic/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,57 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout. + * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + unsigned long low_bits; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return page->index + low_bits; +} + +#endif /* _GENERIC_RMAP_H */ diff -Nru a/include/asm-i386/rmap.h b/include/asm-i386/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-i386/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-ia64/rmap.h b/include/asm-ia64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ia64/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _IA64_RMAP_H +#define _IA64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-m68k/rmap.h b/include/asm-m68k/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-m68k/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _M68K_RMAP_H +#define _M68K_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-mips/rmap.h b/include/asm-mips/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-mips/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _MIPS_RMAP_H +#define _MIPS_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-mips64/rmap.h b/include/asm-mips64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-mips64/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _MIPS64_RMAP_H +#define _MIPS64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-parisc/rmap.h b/include/asm-parisc/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-parisc/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _PARISC_RMAP_H +#define _PARISC_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-ppc/rmap.h b/include/asm-ppc/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-ppc/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,9 @@ +#ifndef _PPC_RMAP_H +#define _PPC_RMAP_H + +/* PPC calls pte_alloc() before mem_map[] is setup ... 
*/ +#define BROKEN_PPC_PTE_ALLOC_ONE + +#include + +#endif diff -Nru a/include/asm-s390/rmap.h b/include/asm-s390/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-s390/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _S390_RMAP_H +#define _S390_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-s390x/rmap.h b/include/asm-s390x/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-s390x/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _S390X_RMAP_H +#define _S390X_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sh/rmap.h b/include/asm-sh/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sh/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _SH_RMAP_H +#define _SH_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sparc/rmap.h b/include/asm-sparc/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sparc/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _SPARC_RMAP_H +#define _SPARC_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/asm-sparc64/rmap.h b/include/asm-sparc64/rmap.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-sparc64/rmap.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,7 @@ +#ifndef _SPARC64_RMAP_H +#define _SPARC64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -Nru a/include/linux/elevator.h b/include/linux/elevator.h --- a/include/linux/elevator.h Tue Apr 9 17:36:56 2002 +++ b/include/linux/elevator.h Tue Apr 9 17:36:56 2002 @@ -1,12 +1,9 @@ #ifndef _LINUX_ELEVATOR_H #define _LINUX_ELEVATOR_H -typedef void (elevator_fn) (struct request *, elevator_t *, - struct list_head *, - struct list_head *, int); - -typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct list_head *, - struct buffer_head *, int, int); +typedef int (elevator_merge_fn)(request_queue_t *, struct request **, + struct list_head *, struct buffer_head *bh, + int rw, int max_sectors); typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int); @@ -16,6 +13,7 @@ { int read_latency; int write_latency; + int max_bomb_segments; elevator_merge_fn *elevator_merge_fn; elevator_merge_cleanup_fn *elevator_merge_cleanup_fn; @@ -24,13 +22,13 @@ unsigned int queue_ID; }; -int elevator_noop_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_noop_merge_req(struct request *, struct request *); - -int elevator_linus_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_linus_merge_req(struct request *, struct request *); +elevator_merge_fn elevator_noop_merge; +elevator_merge_cleanup_fn elevator_noop_merge_cleanup; +elevator_merge_req_fn elevator_noop_merge_req; + +elevator_merge_fn elevator_linus_merge; +elevator_merge_cleanup_fn elevator_linus_merge_cleanup; +elevator_merge_req_fn elevator_linus_merge_req; typedef struct blkelv_ioctl_arg_s { int queue_ID; @@ -54,22 +52,6 @@ #define ELEVATOR_FRONT_MERGE 1 #define ELEVATOR_BACK_MERGE 2 -/* - * This is used in the elevator algorithm. We don't prioritise reads - * over writes any more --- although reads are more time-critical than - * writes, by treating them equally we increase filesystem throughput. 
- * This turns out to give better overall performance. -- sct - */ -#define IN_ORDER(s1,s2) \ - ((((s1)->rq_dev == (s2)->rq_dev && \ - (s1)->sector < (s2)->sector)) || \ - (s1)->rq_dev < (s2)->rq_dev) - -#define BHRQ_IN_ORDER(bh, rq) \ - ((((bh)->b_rdev == (rq)->rq_dev && \ - (bh)->b_rsector < (rq)->sector)) || \ - (bh)->b_rdev < (rq)->rq_dev) - static inline int elevator_request_latency(elevator_t * elevator, int rw) { int latency; @@ -85,7 +67,7 @@ ((elevator_t) { \ 0, /* read_latency */ \ 0, /* write_latency */ \ - \ + 0, /* max_bomb_segments */ \ elevator_noop_merge, /* elevator_merge_fn */ \ elevator_noop_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_noop_merge_req, /* elevator_merge_req_fn */ \ @@ -95,7 +77,7 @@ ((elevator_t) { \ 8192, /* read passovers */ \ 16384, /* write passovers */ \ - \ + 6, /* max_bomb_segments */ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ diff -Nru a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h Tue Apr 9 17:36:56 2002 +++ b/include/linux/fs.h Tue Apr 9 17:36:56 2002 @@ -285,7 +285,7 @@ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); -#define touch_buffer(bh) mark_page_accessed(bh->b_page) +#define touch_buffer(bh) touch_page(bh->b_page) #include diff -Nru a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h Tue Apr 9 17:36:56 2002 +++ b/include/linux/mm.h Tue Apr 9 17:36:56 2002 @@ -18,9 +18,6 @@ extern unsigned long num_mappedpages; extern void * high_memory; extern int page_cluster; -/* The inactive_clean lists are per zone. */ -extern struct list_head active_list; -extern struct list_head inactive_list; #include #include @@ -134,6 +131,9 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -160,6 +160,8 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ + unsigned char age; /* Page aging counter. */ + struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */ struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. 
*/ @@ -287,9 +289,9 @@ #define PG_referenced 2 #define PG_uptodate 3 #define PG_dirty 4 -#define PG_unused 5 -#define PG_lru 6 -#define PG_active 7 +#define PG_inactive_clean 5 +#define PG_active 6 +#define PG_inactive_dirty 7 #define PG_slab 8 #define PG_skip 10 #define PG_highmem 11 @@ -391,10 +393,19 @@ #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) + +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) -#define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define PageLRU(pp) \ + (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp)) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -459,6 +470,7 @@ #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) +extern void FASTCALL(fixup_freespace(struct zone_struct *, int)); extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); diff -Nru a/include/linux/mm_inline.h b/include/linux/mm_inline.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/mm_inline.h Tue Apr 9 17:36:56 2002 @@ -0,0 +1,294 @@ +#ifndef _LINUX_MM_INLINE_H +#define _LINUX_MM_INLINE_H + +#include + +/* + * These inline functions tend to need bits and pieces of all the + * other VM include files, meaning they cannot be defined inside + * one of the other VM include files. + * + * The include file mess really needs to be cleaned up... 
+ */ + +static inline void add_page_to_active_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageActive(page); + list_add(&page->lru, &zone->active_list); + zone->active_pages++; + nr_active_pages++; +} + +static inline void add_page_to_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveDirty(page); + list_add(&page->lru, &zone->inactive_dirty_list); + zone->inactive_dirty_pages++; + nr_inactive_dirty_pages++; +} + +static inline void add_page_to_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveClean(page); + list_add(&page->lru, &zone->inactive_clean_list); + zone->inactive_clean_pages++; + nr_inactive_clean_pages++; +} + +static inline void del_page_from_active_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageActive(page); + nr_active_pages--; + zone->active_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveDirty(page); + nr_inactive_dirty_pages--; + zone->inactive_dirty_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveClean(page); + zone->inactive_clean_pages--; + nr_inactive_clean_pages--; + DEBUG_LRU_PAGE(page); +} + +/* + * Inline functions to control some balancing in the VM. + * + * Note that we do both global and per-zone balancing, with + * most of the balancing done globally. + */ +#define PLENTY_FACTOR 2 +#define ALL_ZONES NULL +#define ANY_ZONE (struct zone_struct *)(~0UL) +#define INACTIVE_FACTOR 5 + +#define VM_MIN 0 +#define VM_LOW 1 +#define VM_HIGH 2 +#define VM_PLENTY 3 +static inline int zone_free_limit(struct zone_struct * zone, int limit) +{ + int free, target, delta; + + /* This is really nasty, but GCC should completely optimise it away. */ + if (limit == VM_MIN) + target = zone->pages_min; + else if (limit == VM_LOW) + target = zone->pages_low; + else if (limit == VM_HIGH) + target = zone->pages_high; + else + target = zone->pages_high * PLENTY_FACTOR; + + free = zone->free_pages + zone->inactive_clean_pages; + delta = target - free; + + return delta; +} + +static inline int free_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_free_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_free_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_free_limit(zone, limit); + } + + return shortage; +} + +/** + * free_min - test for critically low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a serious shortage of free and + * clean pages, zero or negative if there is no serious shortage. + */ +static inline int free_min(struct zone_struct * zone) +{ + return free_limit(zone, VM_MIN); +} + +/** + * free_low - test for low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a shortage of free and + * clean pages, zero or negative if there is no shortage. 
+ */ +static inline int free_low(struct zone_struct * zone) +{ + return free_limit(zone, VM_LOW); +} + +/** + * free_high - test if amount of free pages is less than ideal + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free and clean + * pages is below kswapd's target, zero or negative if we + * have more than enough free and clean pages. + */ +static inline int free_high(struct zone_struct * zone) +{ + return free_limit(zone, VM_HIGH); +} + +/** + * free_plenty - test if enough pages are freed + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free + clean pages + * in a zone is not yet excessive and kswapd is still allowed to + * free pages here, a negative value if kswapd should leave the + * zone alone. + */ +static inline int free_plenty(struct zone_struct * zone) +{ + return free_limit(zone, VM_PLENTY); +} + +/* + * The inactive page target is the free target + 20% of (active + inactive) + * pages. + */ +static inline int zone_inactive_limit(struct zone_struct * zone, int limit) +{ + int inactive, target, inactive_base; + + inactive_base = zone->active_pages + zone->inactive_dirty_pages; + inactive_base /= INACTIVE_FACTOR; + + /* GCC should optimise this away completely. */ + if (limit == VM_MIN) + target = zone->pages_high + inactive_base / 2; + else if (limit == VM_LOW) + target = zone->pages_high + inactive_base; + else + target = zone->pages_high + inactive_base * 2; + + inactive = zone->free_pages + zone->inactive_clean_pages; + inactive += zone->inactive_dirty_pages; + + return target - inactive; +} + +static inline int inactive_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_inactive_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_inactive_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_inactive_limit(zone, limit); + } + + return shortage; +} + +/** + * inactive_min - test for serious shortage of (free + inactive clean) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no serious shortage of (free + inactive clean) pages + */ +static inline int inactive_min(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_MIN); +} + +/** + * inactive_low - test for shortage of (free + inactive clean) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no shortage of (free + inactive clean) pages + */ +static inline int inactive_low(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_LOW); +} + +/** + * inactive_high - less than ideal amount of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have more than enough (free + inactive) pages + */ +static inline int inactive_high(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_HIGH); +} + +/* + * inactive_target - number of inactive pages we ought to have. + */ +static inline int inactive_target(void) +{ + int target; + + target = nr_active_pages + nr_inactive_dirty_pages + + nr_inactive_clean_pages; + + target /= INACTIVE_FACTOR; + + return target; +} + +/* + * Called whenever the VM references a page. 
We immediately reclaim + * the inactive clean pages because those are counted as freeable. + * We don't modify the inactive dirty ones because we're never sure + * if those are freeable anyway. + */ +static inline void touch_page(struct page * page) +{ + if (PageInactiveClean(page)) { + struct zone_struct * zone = page_zone(page); + int free = zone->free_pages + zone->inactive_clean_pages; + activate_page(page); + if (free < zone->pages_low) + wakeup_kswapd(GFP_NOIO); + if (zone->free_pages < zone->pages_min) + fixup_freespace(zone, 1); + } else + SetPageReferenced(page); +} + +#endif diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Tue Apr 9 17:36:56 2002 +++ b/include/linux/mmzone.h Tue Apr 9 17:36:56 2002 @@ -40,12 +40,18 @@ */ spinlock_t lock; unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; + unsigned long active_pages; + unsigned long inactive_dirty_pages; + unsigned long inactive_clean_pages; + unsigned long pages_min, pages_low, pages_high, pages_plenty; int need_balance; /* * free areas of different sizes */ + struct list_head active_list; + struct list_head inactive_dirty_list; + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; /* @@ -143,9 +149,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) - /* * The following two are not meant for general usage. They are here as * prototypes for the discontig memory code. @@ -157,6 +160,60 @@ struct page *pmap); extern pg_data_t contig_page_data; + +/** + * for_each_pgdat - helper macro to iterate over all nodes + * @pgdat - pg_data_t * variable + * + * Meant to help with common loops of the form + * pgdat = pgdat_list; + * while(pgdat) { + * ... + * pgdat = pgdat->node_next; + * } + */ +#define for_each_pgdat(pgdat) \ + for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) + + +/* + * next_zone - helper magic for for_each_zone() + * Thanks to William Lee Irwin III for this piece of ingenuity. + */ +static inline zone_t *next_zone(zone_t *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone - pgdat->node_zones < MAX_NR_ZONES - 1) + zone++; + + else if (pgdat->node_next) { + pgdat = pgdat->node_next; + zone = pgdat->node_zones; + } else + zone = NULL; + + return zone; +} + +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone - zone_t * variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. This basically means for_each_zone() is an + * easier to read version of this piece of code: + * + * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) + * for(i = 0; i < MAX_NR_ZONES; ++i) { + * zone_t * z = pgdat->node_zones + i; + * ... 
+ * } + * } + */ +#define for_each_zone(zone) \ + for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) + #ifndef CONFIG_DISCONTIGMEM diff -Nru a/include/linux/sched.h b/include/linux/sched.h --- a/include/linux/sched.h Tue Apr 9 17:36:56 2002 +++ b/include/linux/sched.h Tue Apr 9 17:36:56 2002 @@ -227,7 +227,7 @@ unsigned long rss, total_vm, locked_vm; unsigned long def_flags; unsigned long cpu_vm_mask; - unsigned long swap_address; + unsigned long rlimit_rss; unsigned dumpable:1; @@ -246,6 +246,7 @@ mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ page_table_lock: SPIN_LOCK_UNLOCKED, \ mmlist: LIST_HEAD_INIT(name.mmlist), \ + rlimit_rss: RLIM_INFINITY, \ } struct signal_struct { @@ -327,8 +328,6 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; diff -Nru a/include/linux/swap.h b/include/linux/swap.h --- a/include/linux/swap.h Tue Apr 9 17:36:56 2002 +++ b/include/linux/swap.h Tue Apr 9 17:36:56 2002 @@ -86,8 +86,8 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; -extern int nr_inactive_pages; -extern atomic_t nr_async_pages; +extern int nr_inactive_dirty_pages; +extern int nr_inactive_clean_pages; extern atomic_t page_cache_size; extern atomic_t buffermem_pages; extern spinlock_t pagecache_lock; @@ -100,18 +100,39 @@ struct zone_t; +/* linux/mm/rmap.c */ +extern int FASTCALL(page_referenced(struct page *)); +extern void FASTCALL(page_add_rmap(struct page *, pte_t *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern int FASTCALL(try_to_unmap(struct page *)); +extern int FASTCALL(page_over_rsslimit(struct page *)); + +/* return values of try_to_unmap */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 +#define SWAP_ERROR 3 + /* linux/mm/swap.c */ +extern int total_swap_pages; extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); +extern void FASTCALL(drop_page(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern void wakeup_kswapd(unsigned int); +extern void rss_free_pages(unsigned int); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -125,6 +146,7 @@ extern void show_swap_cache_info(void); #endif extern int add_to_swap_cache(struct page *, swp_entry_t); +extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *page); @@ -158,7 +180,14 @@ extern spinlock_t pagemap_lru_lock; -extern void FASTCALL(mark_page_accessed(struct page *)); +/* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 /* * List add/del helper macros. 
These must be called @@ -166,38 +195,12 @@ */ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ - BUG(); \ if (PageActive(page)) \ BUG(); \ -} while (0) - -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ -} while (0) - -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - nr_inactive_pages--; \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ + BUG(); \ } while (0) extern spinlock_t swaplock; diff -Nru a/include/linux/swapctl.h b/include/linux/swapctl.h --- a/include/linux/swapctl.h Tue Apr 9 17:36:56 2002 +++ b/include/linux/swapctl.h Tue Apr 9 17:36:56 2002 @@ -10,4 +10,13 @@ typedef pager_daemon_v1 pager_daemon_t; extern pager_daemon_t pager_daemon; +typedef struct freepages_v1 +{ + unsigned int min; + unsigned int low; + unsigned int high; +} freepages_v1; +typedef freepages_v1 freepages_t; +extern freepages_t freepages; + #endif /* _LINUX_SWAPCTL_H */ diff -Nru a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c Tue Apr 9 17:36:56 2002 +++ b/kernel/fork.c Tue Apr 9 17:36:56 2002 @@ -140,7 +140,6 @@ mm->map_count = 0; mm->rss = 0; mm->cpu_vm_mask = 0; - mm->swap_address = 0; pprev = &mm->mmap; /* @@ -264,9 +263,6 @@ void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - extern struct mm_struct *swap_mm; - if (swap_mm == mm) - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); @@ -662,8 +658,6 @@ #endif p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - - INIT_LIST_HEAD(&p->local_pages); retval = -ENOMEM; /* copy all the process information */ diff -Nru a/kernel/sys.c b/kernel/sys.c --- a/kernel/sys.c Tue Apr 9 17:36:56 2002 +++ b/kernel/sys.c Tue Apr 9 17:36:56 2002 @@ -1128,6 +1128,12 @@ if (resource == RLIMIT_NOFILE) { if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) return -EPERM; + } else if (resource == RLIMIT_RSS && current->mm) { + /* rlimit is specified in bytes, convert to pages */ + unsigned long pages = RLIM_INFINITY; + if (new_rlim.rlim_cur != RLIM_INFINITY) + pages = new_rlim.rlim_cur >> PAGE_SHIFT; + current->mm->rlimit_rss = pages; } *old_rlim = new_rlim; return 0; diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c --- a/kernel/sysctl.c Tue Apr 9 17:36:56 2002 +++ b/kernel/sysctl.c Tue Apr 9 17:36:56 2002 @@ -260,6 +260,8 @@ }; static ctl_table vm_table[] = { + {VM_FREEPG, "freepages", + &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, diff -Nru a/mm/Makefile b/mm/Makefile --- a/mm/Makefile Tue Apr 9 17:36:56 2002 +++ b/mm/Makefile Tue Apr 9 17:36:56 2002 @@ -14,7 +14,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o rmap.o obj-$(CONFIG_HIGHMEM) += highmem.o diff -Nru a/mm/TODO b/mm/TODO --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/TODO Tue Apr 9 17:36:56 
2002 @@ -0,0 +1,38 @@ + VM TODO list + +Forever valid TODO entries: + - keep up with the official kernel + - port over bugfixes + - minimise the diff by keeping code in sync where possible + +Easy short-term features: + - reclaim swap space from refill_inactive() + - simplify SMP locking + - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with + one single function using a for_each_pte() macro + for_each_pte(ptep, mm, start_address, end_address) + - fix page_launder() to not eat horrible amounts of CPU or flush + all pages to disk at once + - better VM balancing, clean vs. dirty ratio + - fix loopback device deadlock + riel: nr_fract=70%, nr_fract_sync=80% + riel: setup a loopback fs ext2-on-ext2 + riel: boot with mem=64m + riel: then write a 500 meg file. + riel: current kernel livelocks. + - stabilise pte_highmem and integrate it with rmap + - page_cache_size per zone + - pte_chain list per zone + - get rid of other global structures/stats, make them per zone + +Long-term features: + - extensive VM statistics + - IO clustering for page_launder() and sync_old_buffers() + - readahead on per-VMA level (+ drop behind?) + - more graceful degradation when the load gets high + - reducing readahead + - unfair pageout so not all apps fall over + - memory objects, using pagecache and tmpfs for storage so + the memory object itself doesn't introduce any new overhead + - using the memory objects, removing page table copying from fork() + - load control able to deal with really extreme loads, swapping diff -Nru a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c Tue Apr 9 17:36:56 2002 +++ b/mm/bootmem.c Tue Apr 9 17:36:56 2002 @@ -326,12 +326,11 @@ pg_data_t *pgdat = pgdat_list; void *ptr; - while (pgdat) { + for_each_pgdat(pgdat) if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal))) return(ptr); - pgdat = pgdat->node_next; - } + /* * Whoops, we cannot satisfy the allocation request. */ diff -Nru a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c Tue Apr 9 17:36:56 2002 +++ b/mm/filemap.c Tue Apr 9 17:36:56 2002 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -233,7 +234,7 @@ static void truncate_complete_page(struct page *page) { /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) + if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0))) lru_cache_del(page); /* @@ -453,6 +454,11 @@ return page; } +static struct page * __find_page(struct address_space * mapping, unsigned long index) +{ + return __find_page_nolock(mapping, index, *page_hash(mapping,index)); +} + static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) { struct list_head *curr; @@ -1009,7 +1015,53 @@ /* - * Same as grab_cache_page, but do not wait if the page is unavailable. + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. + * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. 
*/ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagemap_lru_lock); + while (--index >= start) { + spin_lock(&pagecache_lock); + page = __find_page(mapping, index); + spin_unlock(&pagecache_lock); + if (!page || !PageActive(page)) + break; + drop_page(page); + } + spin_unlock(&pagemap_lru_lock); +} + +/* Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. @@ -1279,6 +1331,12 @@ if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif @@ -1287,25 +1345,6 @@ return; } -/* - * Mark a page as having seen activity. - * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. - */ -void mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page)) { - activate_page(page); - ClearPageReferenced(page); - return; - } - - /* Mark the page referenced, AFTER checking for previous usage.. */ - SetPageReferenced(page); -} /* * This is a generic file read routine, and uses the @@ -1414,7 +1453,7 @@ * beginning or we just did an lseek. */ if (!offset || !filp->f_reada) - mark_page_accessed(page); + touch_page(page); /* * Ok, we have the page, and it's up-to-date, so @@ -1815,7 +1854,7 @@ nr = max; /* And limit it to a sane percentage of the inactive list.. */ - max = nr_inactive_pages / 2; + max = nr_inactive_clean_pages / 2; if (nr > max) nr = max; @@ -1960,7 +1999,7 @@ * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. */ - mark_page_accessed(page); + touch_page(page); flush_page_to_ram(page); return page; @@ -2839,7 +2878,7 @@ page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page)) goto out; - mark_page_accessed(page); + touch_page(page); if (Page_Uptodate(page)) goto out; @@ -3036,6 +3075,7 @@ unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; /* * Try to find the page in the cache. If it isn't there, @@ -3044,8 +3084,10 @@ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. @@ -3089,8 +3131,11 @@ unlock: kunmap(page); /* Mark it unlocked again and drop the page.. 
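touch_page(), which replaces mark_page_accessed() here and in the hunks below, comes from the new include/linux/mm_inline.h and is not part of this excerpt. Judging from its callers, the intent is simply to record the access and let the aging code in vmscan.c decide about promotion on the next scan; a guess at its shape, not the literal definition:

/* Assumption: reconstructed from the call sites, not from the header. */
static inline void touch_page(struct page * page)
{
	/* page_referenced()/refill_inactive_zone() will turn this into
	 * page->age credit during the next scan of the lists. */
	SetPageReferenced(page);
}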
*/ - SetPageReferenced(page); UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + touch_page(page); page_cache_release(page); if (status < 0) diff -Nru a/mm/memory.c b/mm/memory.c --- a/mm/memory.c Tue Apr 9 17:36:56 2002 +++ b/mm/memory.c Tue Apr 9 17:36:56 2002 @@ -45,8 +45,10 @@ #include #include #include +#include #include +#include #include #include @@ -103,6 +105,7 @@ } pte = pte_offset(dir, 0); pmd_clear(dir); + pgtable_remove_rmap(pte); pte_free(pte); } @@ -237,9 +240,11 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || @@ -247,7 +252,7 @@ goto cont_copy_pte_range; /* If it's a COW mapping, write protect it both in the parent and the child */ - if (cow && pte_write(pte)) { + if (cow) { ptep_set_wrprotect(src_pte); pte = *src_pte; } @@ -260,6 +265,7 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + page_add_rmap(ptepage, dst_pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out_unlock; @@ -315,8 +321,10 @@ continue; if (pte_present(pte)) { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) + if (VALID_PAGE(page) && !PageReserved(page)) { freed ++; + page_remove_rmap(page, ptep); + } /* This will eventually call __free_pte on the pte. */ tlb_remove_page(tlb, ptep, address + offset); } else { @@ -981,7 +989,9 @@ if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + page_add_rmap(new_page, page_table); lru_cache_add(new_page); /* Free the old page.. */ @@ -1094,6 +1104,10 @@ struct page *new_page; unsigned long offset; + /* Low on free memory ? Don't make things worse. */ + if (free_low(ALL_ZONES) < 0) + return; + /* * Get the number of handles we should do readahead io to. */ @@ -1142,7 +1156,7 @@ ret = 2; } - mark_page_accessed(page); + touch_page(page); lock_page(page); @@ -1173,6 +1187,7 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + page_add_rmap(page, page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -1188,14 +1203,13 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { pte_t entry; + struct page * page = ZERO_PAGE(addr); /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; - /* Allocate our own private page. 
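The pgtable_add_rmap()/pgtable_remove_rmap() calls added to mm/memory.c above, and the ptep_to_mm()/ptep_to_address() helpers used by mm/rmap.c below, come from the new per-architecture include/asm-*/rmap.h headers, which are outside this hunk. The idea is to stash the owning mm and the base virtual address in the struct page of the page-table page itself, so a bare pte pointer can be mapped back to a process and an address. A sketch of the generic version (reconstructed, not quoted; highmem page tables are ignored here):

static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm,
				    unsigned long address)
{
	struct page * page = virt_to_page(ptep);

	page->mapping = (void *) mm;
	page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
}

static inline void pgtable_remove_rmap(pte_t * ptep)
{
	struct page * page = virt_to_page(ptep);

	page->mapping = NULL;
	page->index = 0;
}

static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
{
	return (struct mm_struct *) virt_to_page(ptep)->mapping;
}

static inline unsigned long ptep_to_address(pte_t * ptep)
{
	struct page * page = virt_to_page(ptep);
	unsigned long nr = ((unsigned long) ptep & ~PAGE_MASK) / sizeof(pte_t);

	/* Each pte in this page-table page maps one PAGE_SIZE chunk of
	 * the aligned range recorded in page->index. */
	return page->index + nr * PAGE_SIZE;
}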
*/ spin_unlock(&mm->page_table_lock); @@ -1214,10 +1228,10 @@ flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); - mark_page_accessed(page); } set_pte(page_table, entry); + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); @@ -1272,6 +1286,8 @@ new_page = page; } + touch_page(new_page); + spin_lock(&mm->page_table_lock); /* * This silly early PAGE_DIRTY setting removes a race @@ -1292,6 +1308,7 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + page_add_rmap(new_page, page_table); } else { /* One of our sibling threads was faster, back out. */ page_cache_release(new_page); @@ -1368,6 +1385,14 @@ current->state = TASK_RUNNING; pgd = pgd_offset(mm, address); + /* + * If we are over our RSS limit and the system needs memory, + * we will free memory for the non-hogs and slow down a bit. + */ + if (mm->rlimit_rss && mm->rss > mm->rlimit_rss && + free_high(ALL_ZONES) > 0) + rss_free_pages(GFP_HIGHUSER); + /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. @@ -1449,6 +1474,7 @@ goto out; } } + pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } out: diff -Nru a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c Tue Apr 9 17:36:56 2002 +++ b/mm/mmap.c Tue Apr 9 17:36:56 2002 @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff -Nru a/mm/mremap.c b/mm/mremap.c --- a/mm/mremap.c Tue Apr 9 17:36:56 2002 +++ b/mm/mremap.c Tue Apr 9 17:36:56 2002 @@ -61,8 +61,14 @@ { int error = 0; pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); pte = ptep_get_and_clear(src); if (!dst) { /* No dest? We must put it back. */ @@ -70,6 +76,8 @@ error++; } set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); } return error; } diff -Nru a/mm/oom_kill.c b/mm/oom_kill.c --- a/mm/oom_kill.c Tue Apr 9 17:36:56 2002 +++ b/mm/oom_kill.c Tue Apr 9 17:36:56 2002 @@ -110,8 +110,7 @@ /* * Simple selection loop. We chose the process with the highest - * number of 'points'. We need the locks to make sure that the - * list of task structs doesn't change while we look the other way. + * number of 'points'. We expect the caller will lock the tasklist. * * (not docbooked, we don't want this one cluttering up the manual) */ @@ -121,7 +120,6 @@ struct task_struct *p = NULL; struct task_struct *chosen = NULL; - read_lock(&tasklist_lock); for_each_task(p) { if (p->pid) { int points = badness(p); @@ -131,7 +129,6 @@ } } } - read_unlock(&tasklist_lock); return chosen; } @@ -170,19 +167,25 @@ */ static void oom_kill(void) { - struct task_struct *p = select_bad_process(), *q; + struct task_struct *p, *q; + extern wait_queue_head_t kswapd_done; + + read_lock(&tasklist_lock); + p = select_bad_process(); /* Found nothing?!?! Either we hang forever, or we panic. */ if (p == NULL) panic("Out of memory and no killable processes...\n"); /* kill all processes that share the ->mm (i.e. all threads) */ - read_lock(&tasklist_lock); for_each_task(q) { if(q->mm == p->mm) oom_kill_task(q); } read_unlock(&tasklist_lock); + /* Chances are by this time our victim is sleeping on kswapd. 
*/ + wake_up(&kswapd_done); + /* * Make kswapd go out of the way, so "p" has a good chance of * killing itself before someone else gets the chance to ask @@ -198,7 +201,7 @@ */ void out_of_memory(void) { - static unsigned long first, last, count; + static unsigned long first, last, count, lastkill; unsigned long now, since; /* @@ -235,8 +238,18 @@ return; /* + * If we just killed a process, wait a while + * to give that task a chance to exit. This + * avoids killing multiple processes needlessly. + */ + since = now - lastkill; + if (since < HZ*5) + return; + + /* * Ok, really out of memory. Kill something. */ + lastkill = now; oom_kill(); reset: diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c Tue Apr 9 17:36:56 2002 +++ b/mm/page_alloc.c Tue Apr 9 17:36:56 2002 @@ -21,12 +21,12 @@ #include #include #include +#include int nr_swap_pages; int nr_active_pages; -int nr_inactive_pages; -struct list_head inactive_list; -struct list_head active_list; +int nr_inactive_dirty_pages; +int nr_inactive_clean_pages; pg_data_t *pgdat_list; /* Used to look up the address of the struct zone encoded in page->zone */ @@ -37,6 +37,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, }; +static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, }; /* * Free_page() adds the page to the free lists. This is optimized for @@ -112,16 +114,17 @@ BUG(); if (PageLocked(page)) BUG(); - if (PageLRU(page)) - BUG(); if (PageActive(page)) BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + if (page->pte_chain) + BUG(); page->flags &= ~((1<flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: - + page->age = PAGE_AGE_START; + zone = page_zone(page); mask = (~0UL) << order; @@ -168,17 +171,6 @@ memlist_add_head(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -237,10 +229,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -259,78 +248,83 @@ } #endif -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do this work ourselves, call kswapd. 
+ */ +void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +void fixup_freespace(zone_t * zone, int direct_reclaim) +{ + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages_ok(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(GFP_ATOMIC); +} + +#define PAGES_KERNEL 0 +#define PAGES_MIN 1 +#define PAGES_LOW 2 +#define PAGES_HIGH 3 + +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) { - struct page * page = NULL; - int __freed = 0; + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; - if (!(gfp_mask & __GFP_WAIT)) - goto out; - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageSwapCache(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); + for (;;) { + zone_t *z = *(zone++); - break; - } - } while ((entry = entry->next) != local_pages); + if (!z) + break; + if (!z->size) + BUG(); + + /* + * We allocate if the number of (free + inactive_clean) + * pages is above the watermark. + */ + switch (limit) { + case PAGES_KERNEL: + water_mark = z->pages_min / 2; + break; + case PAGES_MIN: + water_mark = z->pages_min; + break; + case PAGES_LOW: + water_mark = z->pages_low; + break; + default: + case PAGES_HIGH: + water_mark = z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; } - current->nr_local_pages = 0; } - out: - *freed = __freed; - return page; + + /* Found nothing. */ + return NULL; } /* @@ -338,100 +332,248 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zone, * classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later tripped up by a schedule().) + * + * We fall back to lower-level zones if allocation + * in a higher zone fails. 
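fixup_freespace() and __alloc_pages_limit() above, like page_launder() and the RSS throttling elsewhere in the patch, measure memory pressure with small helpers (free_min(), free_low(), free_high(), free_plenty(), with ALL_ZONES as a "sum over everything" argument) that live in the new include/linux/mm_inline.h and are not shown in this hunk. Judging from how they are used, they return a shortage: the watermark minus (free + inactive_clean) pages, so a value >= 0 means the zone is at or below that watermark. A sketch under that assumption, one helper shown:

/* Assumed shape only; free_min/free_low/free_high would differ just in
 * the watermark they compare against, and the ALL_ZONES case presumably
 * sums the per-zone shortages. */
static inline int free_plenty(zone_t * zone)
{
	int free = zone->free_pages + zone->inactive_clean_pages;

	return zone->pages_plenty - free;	/* > 0 means we are short */
}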
+ */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data we would want to cache. + */ zone = zonelist->zones; - classzone = *zone; min = 1UL << order; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + BUG(); - min += z->pages_low; + min += z->pages_min; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); + } + + /* + * Next, try to allocate a page from a zone with a HIGH + * amount of (free + inactive_clean) pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; + + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low of (free + inactive_clean) pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + wakeup_kswapd(gfp_mask); + + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Kernel allocations can eat a few emergency pages. + * We should be able to run without this, find out why + * the SCSI layer isn't happy ... + */ + if (gfp_mask & __GFP_HIGH) { + page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim); + if (page) + return page; } - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Oh well, we didn't succeed. + */ + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * If so, try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * If we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we need to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... + * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. 
+ */ + if (gfp_mask & __GFP_WAIT) { + __set_current_state(TASK_RUNNING); + current->policy |= SCHED_YIELD; + schedule(); + if (!order || free_high(ALL_ZONES) >= 0) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail if no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } + } + } + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... + */ zone = zonelist->zones; min = 1UL << order; for (;;) { - unsigned long local_min; zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - local_min = z->pages_min; - if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * death. + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } } + goto out_failed; - /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages, and last we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... + */ +defragment: + { + int freed = 0; +defragment_again: zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); if (!z) break; - - page = rmqueue(z, order); - if (page) - return page; + if (!z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } } - return NULL; - } - - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - return NULL; - - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; - - zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* XXX: do real defragmentation instead of calling launder ? */ + if (!freed) { + freed = 1; + current->flags |= PF_MEMALLOC; + try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - return NULL; - - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + +out_failed: + /* No luck.. 
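From the caller's point of view nothing changes: the usual interfaces still funnel into __alloc_pages(), and the gfp_mask decides how far down the fallback chain above a request may go. A hypothetical caller, for orientation only (the identifiers are made up):

#include <linux/mm.h>
#include <linux/errno.h>

static void * my_irq_buffer;
static unsigned long my_table;

static int example_allocs(void)
{
	/* GFP_ATOMIC: no __GFP_WAIT, so it never sleeps; its __GFP_HIGH
	 * bit lets it fall back to the PAGES_KERNEL emergency level. */
	my_irq_buffer = (void *) __get_free_page(GFP_ATOMIC);

	/* GFP_KERNEL: may sleep; an order-2 request like this one can
	 * end up in the defragment path above if memory is fragmented. */
	my_table = __get_free_pages(GFP_KERNEL, 2);

	if (!my_irq_buffer || !my_table) {
		if (my_irq_buffer)
			free_page((unsigned long) my_irq_buffer);
		if (my_table)
			free_pages(my_table, 2);
		return -ENOMEM;
	}
	return 0;
}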
*/ +// printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order); + return NULL; } /* @@ -479,14 +621,11 @@ { unsigned int sum; zone_t *zone; - pg_data_t *pgdat = pgdat_list; sum = 0; - while (pgdat) { - for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) - sum += zone->free_pages; - pgdat = pgdat->node_next; - } + for_each_zone(zone) + sum += zone->free_pages; + return sum; } @@ -495,23 +634,21 @@ */ unsigned int nr_free_buffer_pages (void) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; unsigned int sum = 0; - do { + for_each_pgdat(pgdat) { zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); zone_t **zonep = zonelist->zones; zone_t *zone; for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + sum += zone->free_pages; + sum += zone->inactive_clean_pages; + sum += zone->inactive_dirty_pages; } - pgdat = pgdat->node_next; - } while (pgdat); + } return sum; } @@ -519,13 +656,12 @@ #if CONFIG_HIGHMEM unsigned int nr_free_highpages (void) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; unsigned int pages = 0; - while (pgdat) { + for_each_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; - pgdat = pgdat->node_next; - } + return pages; } #endif @@ -562,10 +698,18 @@ tmpdat = tmpdat->node_next; } - printk("( Active: %d, inactive: %d, free: %d )\n", - nr_active_pages, - nr_inactive_pages, - nr_free_pages()); + printk("Free pages: %6dkB (%6dkB HighMem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + nr_free_highpages() << (PAGE_SHIFT-10)); + + printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n", + nr_active_pages, + nr_inactive_dirty_pages, + nr_inactive_clean_pages, + nr_free_pages(), + freepages.min, + freepages.low, + freepages.high); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -725,9 +869,6 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); - INIT_LIST_HEAD(&active_list); - INIT_LIST_HEAD(&inactive_list); - /* * Some architectures (with lots of mem and discontinous memory * maps) have to search for a good mem_map area: @@ -750,7 +891,7 @@ offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; - unsigned long mask; + unsigned long mask, extrafree = 0; unsigned long size, realsize; zone_table[nid * MAX_NR_ZONES + j] = zone; @@ -764,7 +905,13 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->inactive_clean_pages = 0; + zone->inactive_dirty_pages = 0; zone->need_balance = 0; + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_dirty_list); + INIT_LIST_HEAD(&zone->inactive_clean_list); + if (!size) continue; @@ -784,15 +931,36 @@ pgdat->nr_zones = j+1; + /* + * On large memory machines we keep extra memory + * free for kernel allocations. 
+ */ + if (zone_extrafree_ratio[j]) + extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]); + if (extrafree < zone_balance_max[j]) + extrafree = 0; + mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; - else if (mask > zone_balance_max[j]) - mask = zone_balance_max[j]; - zone->pages_min = mask; - zone->pages_low = mask*2; - zone->pages_high = mask*3; - + zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]); + zone->pages_low = extrafree + mask*2; + zone->pages_high = extrafree + mask*3; + zone->pages_plenty = extrafree + mask*6; + /* + * Add these free targets to the global free target; + * we have to be SURE that freepages.high is higher + * than SUM [zone->pages_min] for all zones, otherwise + * we may have bad bad problems. + * + * This means we cannot make the freepages array writable + * in /proc, but have to add a separate extra_free_target + * for people who require it to catch load spikes in eg. + * gigabit ethernet routing... + */ + freepages.min += zone->pages_min; + freepages.low += zone->pages_low; + freepages.high += zone->pages_high; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; diff -Nru a/mm/rmap.c b/mm/rmap.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/rmap.c Tue Apr 9 17:36:56 2002 @@ -0,0 +1,394 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. + */ + +/* + * Locking: + * - the page->pte_chain is protected by the pagemap_lru_lock, + * we probably want to change this to a per-page lock in the + * future + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include + +#include +#include +#include + +/* #define DEBUG_RMAP */ + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * A singly linked list should be fine for most, if not all, workloads. + * On fork-after-exec the mapping we'll be removing will still be near + * the start of the list, on mixed application systems the short-lived + * processes will have their mappings near the start of the list and + * in systems with long-lived applications the relative overhead of + * exit() will be lower since the applications are long-lived. + */ +struct pte_chain { + struct pte_chain * next; + pte_t * ptep; +}; + +static struct pte_chain * pte_chain_freelist; +static inline struct pte_chain * pte_chain_alloc(void); +static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *); +static void alloc_new_pte_chains(void); + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * Caller needs to hold the pagemap_lru_lock. 
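To put numbers on the watermark calculation above: on a hypothetical single-node 1 GB i386 box (realtotalpages ~ 262144), ZONE_NORMAL has realsize ~ 225280 pages. Then extrafree = min(262144/512, 1024) = 512, which is kept because it is not below zone_balance_max (255); mask = 225280/128 = 1760; so pages_min = 512 + 255 = 767 pages (~3 MB), pages_low = 512 + 3520 = 4032 (~16 MB), pages_high = 512 + 5280 = 5792 (~23 MB) and pages_plenty = 512 + 10560 = 11072 (~43 MB). These per-zone values are what gets accumulated into freepages.min/low/high and exposed through the /proc/sys/vm/freepages entry added in kernel/sysctl.c above.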
+ */ +int FASTCALL(page_referenced(struct page *)); +int page_referenced(struct page * page) +{ + struct pte_chain * pc; + int referenced = 0; + + if (PageTestandClearReferenced(page)) + referenced++; + + /* Check all the page tables mapping this page. */ + for (pc = page->pte_chain; pc; pc = pc->next) { + if (ptep_test_and_clear_young(pc->ptep)) + referenced++; + } + + return referenced; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. + */ +void FASTCALL(page_add_rmap(struct page *, pte_t *)); +void page_add_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain; + + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); +#ifdef DEBUG_RMAP + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!ptep_to_mm(ptep)); + BUG(); + { + struct pte_chain * pc; + for (pc = page->pte_chain; pc; pc = pc->next) { + if (pc->ptep == ptep) + BUG(); + } + } +#endif + pte_chain = pte_chain_alloc(); + + /* Hook up the pte_chain to the page. */ + pte_chain->ptep = ptep; + pte_chain->next = page->pte_chain; + page->pte_chain = pte_chain; + + spin_unlock(&pagemap_lru_lock); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + * Caller needs to hold the mm->page_table_lock. + */ +void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pc, * prev_pc = NULL; + + if (!page || !ptep) + BUG(); + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); + for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) { + if (pc->ptep == ptep) { + pte_chain_free(pc, prev_pc, page); + goto out; + } + } +#ifdef DEBUG_RMAP + /* Not found. This should NEVER happen! */ + printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep); + printk(KERN_ERR "page_remove_rmap: only found: "); + for (pc = page->pte_chain; pc; pc = pc->next) + printk("%p ", pc->ptep); + printk("\n"); + printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n"); +#endif + +out: + spin_unlock(&pagemap_lru_lock); + return; + +} + +/** + * try_to_unmap_one - worker function for try_to_unmap + * @page: page to unmap + * @ptep: page table entry to unmap from page + * + * Internal helper function for try_to_unmap, called for each page + * table entry mapping a page. Because locking order here is opposite + * to the locking order used by the page fault path, we use trylocks. + * Locking: + * pagemap_lru_lock page_launder() + * page lock page_launder(), trylock + * mm->page_table_lock try_to_unmap_one(), trylock + */ +int FASTCALL(try_to_unmap_one(struct page *, pte_t *)); +int try_to_unmap_one(struct page * page, pte_t * ptep) +{ + unsigned long address = ptep_to_address(ptep); + struct mm_struct * mm = ptep_to_mm(ptep); + struct vm_area_struct * vma; + pte_t pte; + int ret; + + if (!mm) + BUG(); + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_AGAIN; + + /* During mremap, it's possible pages are not in a VMA. 
*/ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* Nuke the page table entry. */ + pte = ptep_get_and_clear(ptep); + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + + /* Store the swap location in the pte. See handle_pte_fault() ... */ + if (PageSwapCache(page)) { + swp_entry_t entry; + entry.val = page->index; + swap_duplicate(entry); + set_pte(ptep, swp_entry_to_pte(entry)); + } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pte)) + set_page_dirty(page); + + mm->rss--; + page_cache_release(page); + ret = SWAP_SUCCESS; + +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold pagemap_lru_lock + * and the page lock. Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_ERROR - an error occurred + */ +int FASTCALL(try_to_unmap(struct page *)); +int try_to_unmap(struct page * page) +{ + struct pte_chain * pc, * next_pc, * prev_pc = NULL; + int ret = SWAP_SUCCESS; + + /* This page should not be on the pageout lists. */ + if (!VALID_PAGE(page) || PageReserved(page)) + BUG(); + if (!PageLocked(page)) + BUG(); + /* We need backing store to swap out a page. */ + if (!page->mapping) + BUG(); + + for (pc = page->pte_chain; pc; pc = next_pc) { + next_pc = pc->next; + switch (try_to_unmap_one(page, pc->ptep)) { + case SWAP_SUCCESS: + /* Free the pte_chain struct. */ + pte_chain_free(pc, prev_pc, page); + break; + case SWAP_AGAIN: + /* Skip this pte, remembering status. */ + prev_pc = pc; + ret = SWAP_AGAIN; + continue; + case SWAP_FAIL: + return SWAP_FAIL; + case SWAP_ERROR: + return SWAP_ERROR; + } + } + + return ret; +} + +/** + * page_over_rsslimit - test if the page is over its RSS limit + * @page - page to test + * + * This function returns true if the process owning this page + * is over its RSS (resident set size) limit. For shared pages + * we penalise it only if all processes using it are over their + * rss limits. + * The caller needs to hold the pagemap_lru_lock. + */ +int FASTCALL(page_over_rsslimit(struct page *)); +int page_over_rsslimit(struct page * page) +{ + struct pte_chain * pte_chain = page->pte_chain; + struct mm_struct * mm; + pte_t * ptep; + + /* No process is using the page. */ + if (!pte_chain) + return 0; + + do { + ptep = pte_chain->ptep; + mm = ptep_to_mm(ptep); + + /* + * If the process is under its RSS limit, stop + * scanning and don't penalise the page. + */ + if(!mm->rlimit_rss || mm->rss <= mm->rlimit_rss) + return 0; + + pte_chain = pte_chain->next; + } while (pte_chain); + + return 1; +} + +/** + * pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + * @prev_pte_chain: previous pte_chain on the list (may be NULL) + * @page: page this pte_chain hangs off (may be NULL) + * + * This function unlinks pte_chain from the singly linked list it + * may be on and adds the pte_chain to the free list. May also be + * called for new pte_chain structures which aren't on any list yet. + * Caller needs to hold the pagemap_lru_list. 
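The SWAP_SUCCESS / SWAP_AGAIN / SWAP_FAIL / SWAP_ERROR codes used by try_to_unmap() are added to include/linux/swap.h by this patch; that part of the hunk is not shown here. They are presumably just four distinct constants, e.g.:

/* Assumed values, for orientation only; see the include/linux/swap.h
 * part of the patch for the real definitions. */
#define SWAP_SUCCESS	0
#define SWAP_AGAIN	1
#define SWAP_FAIL	2
#define SWAP_ERROR	3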
+ */ +static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page) +{ + if (prev_pte_chain) + prev_pte_chain->next = pte_chain->next; + else if (page) + page->pte_chain = pte_chain->next; + + pte_chain->ptep = NULL; + pte_chain->next = pte_chain_freelist; + pte_chain_freelist = pte_chain; +} + +/** + * pte_chain_alloc - allocate a pte_chain struct + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. + * Caller needs to hold the pagemap_lru_lock. + */ +static inline struct pte_chain * pte_chain_alloc(void) +{ + struct pte_chain * pte_chain; + + /* Allocate new pte_chain structs as needed. */ + if (!pte_chain_freelist) + alloc_new_pte_chains(); + + /* Grab the first pte_chain from the freelist. */ + pte_chain = pte_chain_freelist; + pte_chain_freelist = pte_chain->next; + pte_chain->next = NULL; + + return pte_chain; +} + +/** + * alloc_new_pte_chains - convert a free page to pte_chain structures + * + * Grabs a free page and converts it to pte_chain structures. We really + * should pre-allocate these earlier in the pagefault path or come up + * with some other trick. + * + * Note that we cannot use the slab cache because the pte_chain structure + * is way smaller than the minimum size of a slab cache allocation. + */ +static void alloc_new_pte_chains(void) +{ + struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC); + int i = PAGE_SIZE / sizeof(struct pte_chain); + + if (pte_chain) { + for (; i-- > 0; pte_chain++) + pte_chain_free(pte_chain, NULL, NULL); + } else { + /* Yeah yeah, I'll fix the pte_chain allocation ... */ + panic("Fix pte_chain allocation, you lazy bastard!\n"); + } +} diff -Nru a/mm/slab.c b/mm/slab.c --- a/mm/slab.c Tue Apr 9 17:36:56 2002 +++ b/mm/slab.c Tue Apr 9 17:36:56 2002 @@ -909,14 +909,13 @@ #define drain_cpu_caches(cachep) do { } while (0) #endif -static int __kmem_cache_shrink(kmem_cache_t *cachep) +/** + * Called with the &cachep->spinlock held, returns number of slabs released + */ +static int __kmem_cache_shrink_locked(kmem_cache_t *cachep) { slab_t *slabp; - int ret; - - drain_cpu_caches(cachep); - - spin_lock_irq(&cachep->spinlock); + int ret = 0; /* If the cache is growing, stop shrinking. */ while (!cachep->growing) { @@ -935,8 +934,20 @@ spin_unlock_irq(&cachep->spinlock); kmem_slab_destroy(cachep, slabp); + ret++; spin_lock_irq(&cachep->spinlock); } + return ret; +} + +static int __kmem_cache_shrink(kmem_cache_t *cachep) +{ + int ret; + + drain_cpu_caches(cachep); + + spin_lock_irq(&cachep->spinlock); + __kmem_cache_shrink_locked(cachep); ret = !list_empty(&cachep->slabs_full) || !list_empty(&cachep->slabs_partial); spin_unlock_irq(&cachep->spinlock); return ret; @@ -947,14 +958,21 @@ * @cachep: The cache to shrink. * * Releases as many slabs as possible for a cache. - * To help debugging, a zero exit status indicates all slabs were released. + * Returns number of pages released. 
*/ int kmem_cache_shrink(kmem_cache_t *cachep) { - if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep)) - BUG(); + int ret; - return __kmem_cache_shrink(cachep); + if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep)) + BUG(); + + drain_cpu_caches(cachep); + + spin_lock_irq(&cachep->spinlock); + ret = __kmem_cache_shrink_locked(cachep); + spin_unlock_irq(&cachep->spinlock); + return ret<<(cachep->gfporder); } /** diff -Nru a/mm/swap.c b/mm/swap.c --- a/mm/swap.c Tue Apr 9 17:36:56 2002 +++ b/mm/swap.c Tue Apr 9 17:36:56 2002 @@ -15,15 +15,29 @@ #include #include -#include #include #include #include +#include #include #include /* for copy_to/from_user */ #include +/* + * We identify three levels of free memory. We never let free mem + * fall below the freepages.min except for atomic allocations. We + * start background swapping if we fall below freepages.high free + * pages, and we begin intensive swapping below freepages.low. + * + * Actual initialization is done in mm/page_alloc.c + */ +freepages_t freepages = { + 0, /* freepages.min */ + 0, /* freepages.low */ + 0 /* freepages.high */ +}; + /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -33,17 +47,102 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * @nolock - are we already holding the pagemap_lru_lock? + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void FASTCALL(deactivate_page_nolock(struct page *)); +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list. + * (some pages aren't on any list at all) + */ + ClearPageReferenced(page); + page->age = 0; + if (PageActive(page)) { + del_page_from_active_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void FASTCALL(deactivate_page(struct page *)); +void deactivate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * drop_page - like deactivate_page, but try inactive_clean list + * @page: the page to drop + * + * Try to move a page to the inactive_clean list, this succeeds if the + * page is clean and not in use by anybody. If the page cannot be placed + * on the inactive_clean list it is placed on the inactive_dirty list + * instead. + * + * Note: this function gets called with the pagemap_lru_lock held. + */ +void FASTCALL(drop_page(struct page *)); +void drop_page(struct page * page) +{ + if (!TryLockPage(page)) { + if (page->mapping && page->buffers) { + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + try_to_release_page(page, GFP_NOIO); + spin_lock(&pagemap_lru_lock); + page_cache_release(page); + } + UnlockPage(page); + } + + /* Make sure the page really is reclaimable. 
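With kmem_cache_shrink() now reporting how many pages it handed back to the page allocator (ret << cachep->gfporder) instead of a boolean, a cache owner can feed real numbers into its own memory-pressure handling. A hypothetical caller, not taken from the patch:

static kmem_cache_t * my_cachep;	/* assumed, created elsewhere */

static int my_cache_pressure(void)
{
	/* Returns the number of whole pages released from my_cachep. */
	int freed_pages = kmem_cache_shrink(my_cachep);

	return freed_pages;
}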
*/ + if (!page->mapping || PageDirty(page) || page->pte_chain || + page->buffers || page_count(page) > 1) + deactivate_page_nolock(page); + + else if (page_count(page) == 1) { + ClearPageReferenced(page); + page->age = 0; + if (PageActive(page)) { + del_page_from_active_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + } + } +} + /* * Move an inactive page to the active list. */ -static inline void activate_page_nolock(struct page * page) +void FASTCALL(activate_page_nolock(struct page *)); +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); add_page_to_active_list(page); } + + /* Make sure the page gets a fair chance at staying active. */ + page->age = max((int)page->age, PAGE_AGE_START); } +void FASTCALL(activate_page(struct page *)); void activate_page(struct page * page) { spin_lock(&pagemap_lru_lock); @@ -55,11 +154,12 @@ * lru_cache_add: add a page to the page lists * @page: the page to add */ +void FASTCALL(lru_cache_add(struct page *)); void lru_cache_add(struct page * page) { - if (!TestSetPageLRU(page)) { + if (!PageLRU(page)) { spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); + add_page_to_active_list(page); spin_unlock(&pagemap_lru_lock); } } @@ -71,14 +171,15 @@ * This function is for when the caller already holds * the pagemap_lru_lock. */ +void FASTCALL(__lru_cache_del(struct page *)); void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActive(page)) { + del_page_from_active_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } } @@ -86,6 +187,7 @@ * lru_cache_del: remove a page from the page lists * @page: the page to remove */ +void FASTCALL(lru_cache_del(struct page *)); void lru_cache_del(struct page * page) { spin_lock(&pagemap_lru_lock); diff -Nru a/mm/swap_state.c b/mm/swap_state.c --- a/mm/swap_state.c Tue Apr 9 17:36:56 2002 +++ b/mm/swap_state.c Tue Apr 9 17:36:56 2002 @@ -89,6 +89,40 @@ return 0; } +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + if (add_to_swap_cache(page, entry) == 0) { + SetPageUptodate(page); + set_page_dirty(page); + swap_free(entry); + return 1; + } + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + } +} + /* * This must be called only on pages that have * been verified to be in the swap cache. 
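The add_page_to_*/del_page_from_* helpers used throughout mm/swap.c and mm/vmscan.c replace the global-list macros deleted from swap.h at the top of this section; their per-zone replacements live in include/linux/mm_inline.h and are not part of this hunk. Reconstructed from their users (reclaim_page() and page_launder_zone() below touch the same counters by hand in their error paths), one pair probably looks roughly like this; a sketch, not the patch text:

static inline void add_page_to_inactive_dirty_list(struct page * page)
{
	zone_t * zone = page_zone(page);

	DEBUG_LRU_PAGE(page);
	SetPageInactiveDirty(page);
	list_add(&page->lru, &zone->inactive_dirty_list);
	zone->inactive_dirty_pages++;
	nr_inactive_dirty_pages++;
}

static inline void del_page_from_inactive_dirty_list(struct page * page)
{
	zone_t * zone = page_zone(page);

	list_del(&page->lru);
	ClearPageInactiveDirty(page);
	zone->inactive_dirty_pages--;
	nr_inactive_dirty_pages--;
}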
diff -Nru a/mm/swapfile.c b/mm/swapfile.c --- a/mm/swapfile.c Tue Apr 9 17:36:56 2002 +++ b/mm/swapfile.c Tue Apr 9 17:36:56 2002 @@ -373,6 +373,7 @@ return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_rmap(page, dir); swap_free(entry); ++vma->vm_mm->rss; } diff -Nru a/mm/vmscan.c b/mm/vmscan.c --- a/mm/vmscan.c Tue Apr 9 17:36:56 2002 +++ b/mm/vmscan.c Tue Apr 9 17:36:56 2002 @@ -1,6 +1,9 @@ /* * linux/mm/vmscan.c * + * The pageout daemon, decides which pages to evict (swap out) and + * does the actual work of freeing them. + * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. @@ -20,9 +23,12 @@ #include #include #include +#include #include +static void refill_freelist(void); +static void wakeup_memwaiters(void); /* * The "priority" of VM scanning is how much of the queues we * will scan in one go. A value of 6 for DEF_PRIORITY implies @@ -31,368 +37,276 @@ */ #define DEF_PRIORITY (6) -/* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, - * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. - */ - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +static inline void age_page_up(struct page *page) { - pte_t pte; - swp_entry_t entry; + page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); +} - /* Don't look at this pte if it's been accessed recently. */ - if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return 0; - } +static inline void age_page_down(struct page *page) +{ + page->age -= min(PAGE_AGE_DECL, (int)page->age); +} - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; +static inline int page_mapping_inuse(struct page * page) +{ + struct address_space * mapping = page->mapping; - /* Don't bother replenishing zones not under pressure.. */ - if (!memclass(page_zone(page), classzone)) - return 0; + /* Page is in somebody's page tables. */ + if (page->pte_chain) + return 1; - if (TryLockPage(page)) + /* XXX: does this happen ? */ + if (!mapping) return 0; - /* From this point on, the odds are that we're going to - * nuke this pte, so read and clear the pte. This hook - * is needed on CPUs which update the accessed and dirty - * bits in hardware. - */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(page_table); - flush_tlb_page(vma, address); + /* File is mmaped by somebody. */ + if (mapping->i_mmap || mapping->i_mmap_shared) + return 1; - if (pte_dirty(pte)) - set_page_dirty(page); - - /* - * Is the page already in the swap cache? If so, then - * we can just drop our reference to it without doing - * any IO - it's already up-to-date on disk. - */ - if (PageSwapCache(page)) { - entry.val = page->index; - swap_duplicate(entry); -set_swap_pte: - set_pte(page_table, swp_entry_to_pte(entry)); -drop_pte: - mm->rss--; - UnlockPage(page); - { - int freeable = page_count(page) - !!page->buffers <= 2; - page_cache_release(page); - return freeable; - } - } + return 0; +} - /* - * Is it a clean page? Then it must be recoverable - * by just paging it in again, and we can just drop - * it.. or if it's dirty but has backing store, - * just mark the page dirty and drop it. 
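age_page_up()/age_page_down() above implement the simple additive page aging used by refill_inactive_zone() and reclaim_page(): each scan in which a page was referenced adds PAGE_AGE_ADV up to PAGE_AGE_MAX, each idle scan subtracts PAGE_AGE_DECL down to zero. The constants come from a header added earlier in the patch; the values below are assumptions, used only to illustrate the dynamics with a runnable toy:

#include <stdio.h>

/* Illustration values, not taken from the patch. */
#define PAGE_AGE_START	2
#define PAGE_AGE_ADV	3
#define PAGE_AGE_DECL	1
#define PAGE_AGE_MAX	64

static int age_up(int age)
{
	return (age + PAGE_AGE_ADV > PAGE_AGE_MAX) ? PAGE_AGE_MAX
						   : age + PAGE_AGE_ADV;
}

static int age_down(int age)
{
	return (age > PAGE_AGE_DECL) ? age - PAGE_AGE_DECL : 0;
}

int main(void)
{
	int age = PAGE_AGE_START, idle_scans = 0;

	/* Referenced in three consecutive scans... */
	age = age_up(age);
	age = age_up(age);
	age = age_up(age);
	printf("age after three referenced scans: %d\n", age);	/* 11 */

	/* ...then left idle until it becomes a deactivation candidate. */
	while (age > 0) {
		age = age_down(age);
		idle_scans++;
	}
	printf("idle scans until age hits 0: %d\n", idle_scans);	/* 11 */
	return 0;
}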
- * - * However, this won't actually free any real - * memory, as the page will just be in the page cache - * somewhere, and as such we should just continue - * our scan. - * - * Basically, this just makes it possible for us to do - * some real work in the future in "refill_inactive()". - */ - if (page->mapping) - goto drop_pte; - if (!PageDirty(page)) - goto drop_pte; +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. + */ +struct page * reclaim_page(zone_t * zone) +{ + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; /* - * Anonymous buffercache pages can be left behind by - * concurrent truncate and pagefault. + * We need to hold the pagecache_lock around all tests to make sure + * reclaim_page() cannot race with find_get_page() and friends. */ - if (page->buffers) - goto preserve; + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + maxscan = zone->inactive_clean_pages; + while (maxscan-- && !list_empty(&zone->inactive_clean_list)) { + page_lru = zone->inactive_clean_list.prev; + page = list_entry(page_lru, struct page, lru); - /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. - */ - for (;;) { - entry = get_swap_page(); - if (!entry.val) - break; - /* Add it to the swap cache and mark it dirty - * (adding to the page cache will clear the dirty - * and uptodate bits, so we need to do it again) - */ - if (add_to_swap_cache(page, entry) == 0) { - SetPageUptodate(page); - set_page_dirty(page); - goto set_swap_pte; + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageInactiveClean(page))) { + printk("VM: reclaim_page, wrong page on list.\n"); + list_del(page_lru); + page_zone(page)->inactive_clean_pages--; + continue; } - /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); - } - /* No swap space left */ -preserve: - set_pte(page_table, pte); - UnlockPage(page); - return 0; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pte_t * pte; - unsigned long pmd_end; - - if (pmd_none(*dir)) - return count; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return count; - } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; + /* Page is being freed */ + if (unlikely(page_count(page)) == 0) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + continue; + } - do { - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); + /* Page cannot be reclaimed ? Move to inactive_dirty list. 
*/ + if (unlikely(page->pte_chain || page->buffers || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TryLockPage(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + continue; + } - if (VALID_PAGE(page) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page, classzone); - if (!count) { - address += PAGE_SIZE; - break; - } - } + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - mm->swap_address = address; - return count; -} -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pmd_t * pmd; - unsigned long pgd_end; + if (page->mapping) { + __remove_inode_page(page); + goto found_page; + } - if (pgd_none(*dir)) - return count; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return count; + /* We should never ever get here. */ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + UnlockPage(page); } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + return NULL; - pmd = pmd_offset(dir, address); - - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); - if (!count) - break; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return count; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) -{ - pgd_t *pgdir; - unsigned long end; - - /* Don't swap out areas which are reserved */ - if (vma->vm_flags & VM_RESERVED) - return count; - - pgdir = pgd_offset(mm, address); - end = vma->vm_end; - BUG_ON(address >= end); - do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); - if (!count) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return count; +found_page: + del_page_from_inactive_clean_list(page); + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + if (entry.val) + swap_free(entry); + UnlockPage(page); + page->age = PAGE_AGE_START; + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; - -/* - * Returns remaining count of pages to be swapped out by followup call. +/** + * page_dirty - do we need to write the data out to disk + * @page: page to test + * + * Returns true if the page contains data which needs to + * be written to disk. Doesn't test the page tables (yet?). */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) +static inline int page_dirty(struct page *page) { - unsigned long address; - struct vm_area_struct* vma; + struct buffer_head *tmp, *bh; - /* - * Find the proper vm-area after freezing the vma chain - * and ptes. 
- */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - if (address == TASK_SIZE || swap_mm != mm) { - /* We raced: don't count this mm but try again */ - ++*mmcounter; - goto out_unlock; - } - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - count = swap_out_vma(mm, vma, address, count, classzone); - vma = vma->vm_next; - if (!vma) - break; - if (!count) - goto out_unlock; - address = vma->vm_start; - } - } - /* Indicate that we reached the end of address space */ - mm->swap_address = TASK_SIZE; + if (PageDirty(page)) + return 1; -out_unlock: - spin_unlock(&mm->page_table_lock); - return count; -} + if (page->mapping && !page->buffers) + return 0; -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) -{ - int counter, nr_pages = SWAP_CLUSTER_MAX; - struct mm_struct *mm; + tmp = bh = page->buffers; - counter = mmlist_nr; do { - if (unlikely(current->need_resched)) { - __set_current_state(TASK_RUNNING); - schedule(); - } - - spin_lock(&mmlist_lock); - mm = swap_mm; - while (mm->swap_address == TASK_SIZE || mm == &init_mm) { - mm->swap_address = 0; - mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == swap_mm) - goto empty; - swap_mm = mm; - } - - /* Make sure the mm doesn't disappear when we drop the lock.. */ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); - - nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); - - mmput(mm); - - if (!nr_pages) + if (tmp->b_state & ((1<= 0); + tmp = tmp->b_this_page; + } while (tmp != bh); return 0; - -empty: - spin_unlock(&mmlist_lock); - return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +/** + * page_launder_zone - clean dirty inactive pages, move to inactive_clean list + * @zone: zone to free pages in + * @gfp_mask: what operations we are allowed to do + * + * This function is called when we are low on free / inactive_clean + * pages, its purpose is to refill the free/clean list as efficiently + * as possible. + * + * This means we do writes asynchronously as long as possible and will + * only sleep on IO when we don't have another option. Since writeouts + * cause disk seeks and make read IO slower, we skip writes alltogether + * when the amount of dirty pages is small. + * + * This code is heavily inspired by the FreeBSD source code. Thanks + * go out to Matthew Dillon. + */ +#define CAN_DO_FS ((gfp_mask & __GFP_FS) && should_write) +int page_launder_zone(zone_t * zone, int gfp_mask, int priority) { + int maxscan, cleaned_pages, target; struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10); + target = free_plenty(zone); + cleaned_pages = 0; + + /* The main launder loop. 
*/ spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + maxscan = zone->inactive_dirty_pages >> priority; + while (maxscan-- && !list_empty(&zone->inactive_dirty_list)) { struct page * page; - - if (unlikely(current->need_resched)) { + + /* Low latency reschedule point */ + if (current->need_resched) { spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); schedule(); spin_lock(&pagemap_lru_lock); continue; } + entry = zone->inactive_dirty_list.prev; page = list_entry(entry, struct page, lru); - BUG_ON(!PageLRU(page)); - BUG_ON(PageActive(page)); + if (cleaned_pages > target) + break; list_del(entry); - list_add(entry, &inactive_list); + list_add(entry, &zone->inactive_dirty_list); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (!PageInactiveDirty(page)) { + printk("VM: page_launder, wrong page on list.\n"); + list_del(entry); + nr_inactive_dirty_pages--; + page_zone(page)->inactive_dirty_pages--; + continue; + } /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. + * The page is in active use or really unfreeable. Move to + * the active list and adjust the page age if needed. */ - if (unlikely(!page_count(page))) + if (page_referenced(page) && page_mapping_inuse(page) && + !page_over_rsslimit(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + page->age = max((int)page->age, PAGE_AGE_START); continue; + } - if (!memclass(page_zone(page), classzone)) + /* + * Page is being freed, don't worry about it. + */ + if (unlikely(page_count(page)) == 0) continue; - /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) - goto page_mapped; - /* * The page is locked. IO in progress? * Move it to the back of the list. */ - if (unlikely(TryLockPage(page))) { - if (PageLaunder(page) && (gfp_mask & __GFP_FS)) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - wait_on_page(page); + if (unlikely(TryLockPage(page))) + continue; + + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page->pte_chain && !page->mapping && !page->buffers) { + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + if (!add_to_swap(page)) { + activate_page(page); + UnlockPage(page); page_cache_release(page); spin_lock(&pagemap_lru_lock); + continue; } - continue; + page_cache_release(page); + spin_lock(&pagemap_lru_lock); } - if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page->pte_chain) { + switch (try_to_unmap(page)) { + case SWAP_ERROR: + case SWAP_FAIL: + goto page_active; + case SWAP_AGAIN: + UnlockPage(page); + continue; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page) && page->mapping) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer * like O_DIRECT would set the PG_dirty bitflag - * on the phisical page after having successfully + * on the physical page after having successfully * pinned it and after the I/O to the page is finished, * so the direct writes to the page cannot get lost. 
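
The unmap step above depends on try_to_unmap() and the page->pte_chain reverse mapping that this patch introduces in mm/rmap.c; the real data structure and locking live there. A minimal sketch of the idea only, assuming a simple singly linked chain of pte pointers per page (the struct layout and function shape below are assumptions for illustration, not the patch's code):

/* Simplified model of reverse-map unmapping; not the patch's actual code. */
struct pte_chain_sketch {
	struct pte_chain_sketch *next;
	pte_t *ptep;				/* one pte that maps this page */
};

static int try_to_unmap_sketch(struct page *page, struct pte_chain_sketch *chain)
{
	struct pte_chain_sketch *pc;

	for (pc = chain; pc; pc = pc->next) {
		pte_t pte = ptep_get_and_clear(pc->ptep);	/* drop the mapping */

		if (pte_dirty(pte))
			set_page_dirty(page);	/* don't lose dirty state */
		/* the real code also flushes the TLB, installs a swap entry
		 * where needed, updates the mm's rss and frees the chain node */
	}
	return SWAP_SUCCESS;			/* page is no longer mapped anywhere */
}
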
*/ @@ -421,7 +335,7 @@ if (page->buffers) { spin_unlock(&pagemap_lru_lock); - /* avoid to free a locked page */ + /* To avoid freeing our page before we're done. */ page_cache_get(page); if (try_to_release_page(page, gfp_mask)) { @@ -439,14 +353,14 @@ /* effectively free the page here */ page_cache_release(page); - if (--nr_pages) - continue; - break; + cleaned_pages++; + continue; } else { /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. + * We freed the buffers but may have + * slept; undo the stuff we did before + * try_to_release_page and fall through + * to the next step. */ page_cache_release(page); @@ -462,227 +376,268 @@ } } - spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * If the page is really freeable now, move it to the + * inactive_clean list. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. + * This test is not safe from races, but only the one + * in reclaim_page() needs to be. */ - if (!page->mapping || !is_page_cache_freeable(page)) { - spin_unlock(&pagecache_lock); + if (page->mapping && !PageDirty(page) && !page->pte_chain && + page_count(page) == 1) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; - + cleaned_pages++; + } else { /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it to + * the active list. */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; +page_active: + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + UnlockPage(page); + } + } + spin_unlock(&pagemap_lru_lock); + + /* Return the number of pages moved to the inactive_clean list. */ + return cleaned_pages; +} + +/** + * page_launder - clean dirty inactive pages, move to inactive_clean list + * @gfp_mask: what operations we are allowed to do + * + * This function iterates over all zones and calls page_launder_zone(), + * balancing still needs to be added... + */ +int page_launder(int gfp_mask) +{ + int maxtry = 1 << DEF_PRIORITY; + struct zone_struct * zone; + int freed = 0; + + /* Global balancing while we have a global shortage. */ + while (maxtry-- && free_high(ALL_ZONES) >= 0) { + for_each_zone(zone) + if (free_plenty(zone) >= 0) + freed += page_launder_zone(zone, gfp_mask, 6); + } + + /* Clean up the remaining zones with a serious shortage, if any. */ + for_each_zone(zone) + if (free_min(zone) >= 0) + freed += page_launder_zone(zone, gfp_mask, 0); + + return freed; +} + +/** + * refill_inactive_zone - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan + * + * This function will scan a portion of the active list of a zone to find + * unused pages, those pages will then be moved to the inactive list. + */ +int refill_inactive_zone(struct zone_struct * zone, int priority) +{ + int maxscan = zone->active_pages >> priority; + int target = inactive_high(zone); + struct list_head * page_lru; + int nr_deactivated = 0; + struct page * page; + + /* Take the lock while messing with the list... 
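
page_launder() above, like most of the new code in this file, is driven by the free_min()/free_plenty()/free_high() helpers defined elsewhere in this patch (see the mmzone.h and mm_inline.h changes). Their exact thresholds are not visible in this hunk; the sketch below only illustrates the convention the callers rely on, namely that a positive return value means "this many pages are still missing" (the bodies here are assumptions, not the patch's definitions):

/* Assumed models of the shortage helpers -- the real ones differ in detail
 * and can also aggregate over ALL_ZONES / ANY_ZONE. */
static inline long free_min_sketch(zone_t *zone)
{
	/* > 0: pages short of the hard minimum, <= 0: enough free pages */
	return (long)zone->pages_min - (long)zone->free_pages;
}

static inline long free_plenty_sketch(zone_t *zone)
{
	/* same convention, measured against a comfortably higher target */
	return (long)(zone->pages_high * 2) -
	       (long)(zone->free_pages + zone->inactive_clean_pages);
}
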
*/ + spin_lock(&pagemap_lru_lock); + while (maxscan-- && !list_empty(&zone->active_list)) { + page_lru = zone->active_list.prev; + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageActive(page))) { + printk("VM: refill_inactive, wrong page on list.\n"); + list_del(page_lru); + nr_active_pages--; + continue; } /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. + * If the object the page is in is not in use we don't + * bother with page aging. If the page is touched again + * while on the inactive_clean list it'll be reactivated. */ - if (PageDirty(page)) { - spin_unlock(&pagecache_lock); - UnlockPage(page); + if (!page_mapping_inuse(page)) { + drop_page(page); continue; } - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - spin_unlock(&pagecache_lock); + /* + * Do aging on the pages. + */ + if (page_referenced(page)) { + age_page_up(page); } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); - swap_free(swap); + age_page_down(page); } - __lru_cache_del(page); - UnlockPage(page); - - /* effectively free the page here */ - page_cache_release(page); + /* + * If the page age is 'hot' and the process using the + * page doesn't exceed its RSS limit we keep the page. + * Otherwise we move it to the inactive_dirty list. + */ + if (page->age && !page_over_rsslimit(page)) { + list_del(page_lru); + list_add(page_lru, &zone->active_list); + } else { + deactivate_page_nolock(page); + if (++nr_deactivated > target) + break; + } - if (--nr_pages) - continue; - break; + /* Low latency reschedule point */ + if (current->need_resched) { + spin_unlock(&pagemap_lru_lock); + schedule(); + spin_lock(&pagemap_lru_lock); + } } spin_unlock(&pagemap_lru_lock); - return nr_pages; + return nr_deactivated; } -/* - * This moves pages from the active list to - * the inactive list. +/** + * refill_inactive - checks all zones and refills the inactive list as needed * - * We move them the other way when we see the - * reference bit on the page. + * This function tries to balance page eviction from all zones by aging + * the pages from each zone in the same ratio until the global inactive + * shortage is resolved. After that it does one last "clean-up" scan to + * fix up local inactive shortages. */ -static void refill_inactive(int nr_pages) +int refill_inactive(void) { - struct list_head * entry; - - spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages && entry != &active_list) { - struct page * page; + int maxtry = 1 << DEF_PRIORITY; + zone_t * zone; + int ret = 0; - page = list_entry(entry, struct page, lru); - entry = entry->prev; - if (PageTestandClearReferenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); - continue; + /* Global balancing while we have a global shortage. */ + while (maxtry-- && inactive_low(ALL_ZONES) >= 0) { + for_each_zone(zone) { + if (inactive_high(zone) >= 0) + ret += refill_inactive_zone(zone, DEF_PRIORITY); } + } - nr_pages--; - - del_page_from_active_list(page); - add_page_to_inactive_list(page); - SetPageReferenced(page); + /* Local balancing for zones which really need it. 
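
The aging in refill_inactive_zone() uses age_page_up(), age_page_down() and PAGE_AGE_START, which are defined elsewhere in this patch; the constants and exact curve are not shown in this hunk. A minimal sketch of the usual "rise on reference, decay when idle" policy this style of aging implements (names and values below are illustrative only):

/* Illustrative aging helpers; the real ones ship elsewhere in the patch. */
#define SKETCH_AGE_START	2
#define SKETCH_AGE_ADV		3
#define SKETCH_AGE_MAX		64

static inline void sketch_age_up(struct page *page)
{
	page->age += SKETCH_AGE_ADV;		/* referenced: page gets "hotter" */
	if (page->age > SKETCH_AGE_MAX)
		page->age = SKETCH_AGE_MAX;
}

static inline void sketch_age_down(struct page *page)
{
	page->age /= 2;				/* idle: age decays quickly towards 0 */
}
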
*/ + for_each_zone(zone) { + if (inactive_min(zone) >= 0) + ret += refill_inactive_zone(zone, 0); } - spin_unlock(&pagemap_lru_lock); + + return ret; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/** + * background_aging - slow background aging of zones + * @priority: priority at which to scan + * + * When the VM load is low or nonexistent, this function is + * called once a second to "sort" the pages in the VM. This + * way we know which pages to evict once a load spike happens. + * The effects of this function are very slow; the CPU usage + * should be minimal to nonexistent under most loads. + */ +static inline void background_aging(int priority) { - int chunk_size = nr_pages; - unsigned long ratio; + struct zone_struct * zone; - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; - - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + for_each_zone(zone) + if (inactive_high(zone) > 0) + refill_inactive_zone(zone, priority); +} - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; +/* + * Worker function for kswapd and try_to_free_pages; we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 0; - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. + */ + ret += page_launder(gfp_mask); + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(1, gfp_mask); #ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); #endif - return nr_pages; -} + /* + * Move pages from the active list to the inactive list. + */ + refill_inactive(); -int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) -{ - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + /* + * Reclaim unused slab cache memory. + */ + ret += kmem_cache_reap(gfp_mask); - gfp_mask = pf_gfp_mask(gfp_mask); - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + refill_freelist(); + + /* Start IO when needed. */ + if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0) + run_task_queue(&tq_disk); /* * Hmm.. Cache shrink failed - time to kill something? * Mhwahahhaha! This is the part I really like. Giggle.
*/ - out_of_memory(); - return 0; -} + if (!ret && free_min(ANY_ZONE) > 0) + out_of_memory(); -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) -{ - zone_t * first_classzone; - - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; + return ret; } -static int kswapd_balance_pgdat(pg_data_t * pgdat) +/** + * refill_freelist - move inactive_clean pages to free list if needed + * + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. This + * function really only does something when we don't have a + * userspace load on __alloc_pages(). + * + * We refill the freelist in a bump from pages_min to pages_min * 2 + * in order to give the buddy allocator something to play with. + */ +static void refill_freelist(void) { - int need_more_balance = 0, i; + struct page * page; zone_t * zone; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (unlikely(current->need_resched)) - schedule(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + for_each_zone(zone) { + if (!zone->size || zone->free_pages >= zone->pages_min) continue; - } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; - } - - return need_more_balance; -} - -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; - - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->node_next)); - } while (need_more_balance); -} - -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - zone_t * zone; - int i; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; + while (zone->free_pages < zone->pages_min * 2) { + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } } - - return 1; -} - -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; - - pgdat = pgdat_list; - do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); - - return 1; } /* @@ -701,7 +656,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -725,24 +679,156 @@ * Kswapd main loop. */ for (;;) { - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + static long recalc = 0; - mb(); - if (kswapd_can_sleep()) - schedule(); + /* + * We try to rebalance the VM either when we have a + * global shortage of free pages or when one particular + * zone is very short on free pages. + */ + if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0) + do_try_to_free_pages(GFP_KSWAPD); + + refill_freelist(); + + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; - __set_current_state(TASK_RUNNING); + /* Do background page aging. 
*/ + background_aging(DEF_PRIORITY); + } + + wakeup_memwaiters(); + } +} + +static int kswapd_overloaded; DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +DECLARE_WAIT_QUEUE_HEAD(kswapd_done); +#define VM_SHOULD_SLEEP ((free_low(ALL_ZONES) > (freepages.min / 2)) && \ + !kswapd_overloaded) + +/** + * wakeup_kswapd - wake up the pageout daemon + * @gfp_mask: page freeing flags + * + * This function wakes up kswapd and can, under heavy VM pressure, + * put the calling task to sleep temporarily. + */ +void wakeup_kswapd(unsigned int gfp_mask) +{ + DECLARE_WAITQUEUE(wait, current); + + /* If we're in the memory freeing business ourselves, don't sleep + * but just wake kswapd and go back to business. + */ + if (current->flags & PF_MEMALLOC) { + wake_up_interruptible(&kswapd_wait); + return; + } + + /* We need all of kswapd's GFP flags; otherwise we can't sleep on it. + * We still wake kswapd of course. + */ + if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) { + wake_up_interruptible(&kswapd_wait); + return; + } + + add_wait_queue(&kswapd_done, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Wake kswapd .... */ + wake_up_interruptible(&kswapd_wait); + + /* ... and check if we need to wait on it */ + if (VM_SHOULD_SLEEP) + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); +} + +static void wakeup_memwaiters(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&kswapd_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + /* Don't let the processes waiting on memory get stuck, ever. */ + wake_up(&kswapd_done); + + /* Enough free RAM; we can easily keep up with memory demand. */ + if (free_high(ALL_ZONES) <= 0) { + schedule_timeout(HZ); remove_wait_queue(&kswapd_wait, &wait); + return; + } + remove_wait_queue(&kswapd_wait, &wait); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. - */ - kswapd_balance(); - run_task_queue(&tq_disk); + /* OK, the VM is very loaded. Sleep instead of using all CPU. */ + kswapd_overloaded = 1; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 4); + kswapd_overloaded = 0; + return; +} + +/** + * try_to_free_pages - run the pageout code ourselves + * @gfp_mask: mask of things the pageout code is allowed to do + * + * When the load on the system gets higher, it can happen + * that kswapd no longer manages to keep enough memory + * free. In those cases user programs allocating memory + * will call try_to_free_pages() and help the pageout code. + * This has the effect of freeing memory and slowing down + * the largest memory hogs a bit. + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + + gfp_mask = pf_gfp_mask(gfp_mask); + if (gfp_mask & __GFP_WAIT) { + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + + return ret; +} + +/** + * rss_free_pages - run part of the pageout code and slow down a bit + * @gfp_mask: mask of things the pageout code is allowed to do + * + * This function is called when a task is over its RSS limit and + * has a page fault. Its goal is to free some memory so non-hogs + * can run faster, and to slow itself down when needed so it won't eat + * the memory non-hogs can use.
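
For context, a rough, assumed illustration of how the entry points in this file are meant to be driven by the rest of the patch (the real call sites live in mm/page_alloc.c and mm/memory.c, which this patch also touches; the function below is not patch code):

/* Assumed caller-side sketch: an allocation path using the hooks above. */
static struct page *alloc_or_reclaim_sketch(zone_t *zone, unsigned int gfp_mask)
{
	struct page *page = reclaim_page(zone);	/* cheap: steal a pre-cleaned page */

	if (page)
		return page;

	wakeup_kswapd(gfp_mask);		/* kick kswapd; may throttle us briefly */

	if (gfp_mask & __GFP_WAIT)
		try_to_free_pages(gfp_mask);	/* help the pageout code directly */

	return NULL;				/* caller retries or falls back */
}

rss_free_pages() below is the matching throttle for the page fault path of a task that is over its RSS limit.
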
+ */ +void rss_free_pages(unsigned int gfp_mask) +{ + long pause = 0; + + if (current->flags & PF_MEMALLOC) + return; + + current->flags |= PF_MEMALLOC; + + do { + page_launder(gfp_mask); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(pause); + set_current_state(TASK_RUNNING); + pause++; + } while (free_high(ALL_ZONES) >= 0); + + current->flags &= ~PF_MEMALLOC; + return; } static int __init kswapd_init(void)