diff -urN linux-2.4.33-pre2-pax/Documentation/vm/overcommit-accounting linux-2.4.33-pre2-oc+pax-fixed/Documentation/vm/overcommit-accounting
--- linux-2.4.33-pre2-pax/Documentation/vm/overcommit-accounting	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.4.33-pre2-oc+pax-fixed/Documentation/vm/overcommit-accounting	2006-03-01 00:02:28.000000000 +0100
@@ -0,0 +1,77 @@
+* This describes the overcommit management facility in the latest kernel
+  tree (FIXME: actually it also describes the stuff that isn't yet done)
+
+The Linux kernel supports four overcommit handling modes
+
+0	-	Heuristic overcommit handling. Obvious overcommits of
+	address space are refused. Used for a typical system. It
+	ensures a seriously wild allocation fails while allowing
+	overcommit to reduce swap usage.
+
+1	-	No overcommit handling. Appropriate for some scientific
+	applications.
+
+2	-	(NEW) Strict overcommit. The total address space commit
+	for the system is not permitted to exceed swap + half ram.
+	In almost all situations this means a process will not be
+	killed while accessing pages but only by malloc failures
+	that are reported back by the kernel mmap/brk code.
+
+3	-	(NEW) Paranoid overcommit. The total address space commit
+	for the system is not permitted to exceed swap. The machine
+	will never kill a process accessing pages it has mapped
+	except due to a bug (i.e. report it!)
+
+The overcommit policy is set via the sysctl `vm.overcommit_memory'.
+
+The overcommit percentage is set via `vm.overcommit_ratio'.
+
+The current overcommit limit and amount committed are viewable in
+/proc/meminfo as CommitLimit and Committed_AS respectively.
+
+Gotchas
+-------
+
+The C language stack growth does an implicit mremap. If you want absolute
+guarantees and run close to the edge you MUST mmap your stack for the
+largest size you think you will need. For typical stack usage this does
+not matter much, but it's a corner case if you really really care.
+
+In modes 2 and 3 the MAP_NORESERVE flag is ignored.
+
+
+How It Works
+------------
+
+The overcommit is based on the following rules
+
+For a file backed map
+	SHARED or READ-only	-	0 cost (the file is the map not swap)
+	PRIVATE WRITABLE	-	size of mapping per instance
+
+For an anonymous or /dev/zero map
+	SHARED			-	size of mapping
+	PRIVATE READ-only	-	0 cost (but of little use)
+	PRIVATE WRITABLE	-	size of mapping per instance
+
+Additional accounting
+	Pages made writable copies by mmap
+	shmfs memory drawn from the same pool
+
+Status
+------
+
+o We account mmap memory mappings
+o We account mprotect changes in commit
+o We account mremap changes in size
+o We account brk
+o We account munmap
+o We report the commit status in /proc
+o Account and check on fork
+o Review stack handling/building on exec
+o SHMfs accounting
+o Implement actual limit enforcement
+
+To Do
+-----
+o Account ptrace pages (this is hard)
diff -urN linux-2.4.33-pre2-pax/arch/alpha/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/alpha/mm/fault.c
--- linux-2.4.33-pre2-pax/arch/alpha/mm/fault.c	2006-03-01 00:02:56.000000000 +0100
+++ linux-2.4.33-pre2-oc+pax-fixed/arch/alpha/mm/fault.c	2006-03-01 00:03:55.000000000 +0100
@@ -254,8 +254,6 @@
 		goto bad_area;
 	if (vma->vm_start <= address)
 		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
 	if (expand_stack(vma, address))
 		goto bad_area;
 /*
diff -urN linux-2.4.33-pre2-pax/arch/arm/mm/fault-common.c linux-2.4.33-pre2-oc+pax-fixed/arch/arm/mm/fault-common.c
--- linux-2.4.33-pre2-pax/arch/arm/mm/fault-common.c	2003-08-25 13:44:39.000000000 +0200
+++ linux-2.4.33-pre2-oc+pax-fixed/arch/arm/mm/fault-common.c	2006-03-01 00:02:26.000000000 +0100
@@ -254,7 +254,7 @@
 		goto survive;
 
 check_stack:
-	if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
+	if (!expand_stack(vma, addr))
 		goto good_area;
 out:
 	return fault;
diff -urN linux-2.4.33-pre2-pax/arch/i386/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/i386/mm/fault.c
--- linux-2.4.33-pre2-pax/arch/i386/mm/fault.c	2006-03-01 00:02:57.000000000 +0100
+++ linux-2.4.33-pre2-oc+pax-fixed/arch/i386/mm/fault.c	2006-03-01 00:03:56.000000000 +0100
@@ -78,9 +78,7 @@
 	return 1;
 
 check_stack:
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (expand_stack(vma, start) == 0)
+	if (!expand_stack(vma, start))
 		goto good_area;
 
 bad_area:
diff -urN linux-2.4.33-pre2-pax/arch/ia64/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/ia64/mm/fault.c
--- linux-2.4.33-pre2-pax/arch/ia64/mm/fault.c	2006-03-01 00:02:57.000000000 +0100
+++ linux-2.4.33-pre2-oc+pax-fixed/arch/ia64/mm/fault.c	2006-03-01 00:03:56.000000000 +0100
@@ -226,8 +226,6 @@
 
   check_expansion:
 	if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
-		if (!(vma->vm_flags & VM_GROWSDOWN))
-			goto bad_area;
 		if (rgn_index(address) != rgn_index(vma->vm_start)
 		    || rgn_offset(address) >= RGN_MAP_LIMIT)
 			goto bad_area;
diff -urN linux-2.4.33-pre2-pax/arch/mips/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/mips/mm/fault.c
--- linux-2.4.33-pre2-pax/arch/mips/mm/fault.c	2006-03-01 00:02:57.000000000 +0100
+++ linux-2.4.33-pre2-oc+pax-fixed/arch/mips/mm/fault.c	2006-03-01 00:03:56.000000000 +0100
@@ -129,8 +129,6 @@
 		goto bad_area;
 	if (vma->vm_start <= address)
 		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
 	if (expand_stack(vma, address))
 		goto bad_area;
 /*
diff -urN linux-2.4.33-pre2-pax/arch/mips64/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/mips64/mm/fault.c
--- linux-2.4.33-pre2-pax/arch/mips64/mm/fault.c	2006-03-01 00:02:57.000000000 +0100
+++ 
linux-2.4.33-pre2-oc+pax-fixed/arch/mips64/mm/fault.c 2006-03-01 00:03:56.000000000 +0100 @@ -153,8 +153,6 @@ goto bad_area; if (vma->vm_start <= address) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* diff -urN linux-2.4.33-pre2-pax/arch/ppc/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/ppc/mm/fault.c --- linux-2.4.33-pre2-pax/arch/ppc/mm/fault.c 2006-03-01 00:02:57.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/arch/ppc/mm/fault.c 2006-03-01 00:03:56.000000000 +0100 @@ -512,42 +512,40 @@ goto bad_area; if (vma->vm_start <= address) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (!is_write) - goto bad_area; - - /* - * N.B. The rs6000/xcoff ABI allows programs to access up to - * a few hundred bytes below the stack pointer. - * The kernel signal delivery code writes up to about 1.5kB - * below the stack pointer (r1) before decrementing it. - * The exec code can write slightly over 640kB to the stack - * before setting the user r1. Thus we allow the stack to - * expand to 1MB without further checks. - */ - if (address + 0x100000 < vma->vm_end) { - /* get user regs even if this fault is in kernel mode */ - struct pt_regs *uregs = current->thread.regs; - if (uregs == NULL) - goto bad_area; - - /* - * A user-mode access to an address a long way below - * the stack pointer is only valid if the instruction - * is one which would update the stack pointer to the - * address accessed if the instruction completed, - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb - * (or the byte, halfword, float or double forms). - * - * If we don't check this then any write to the area - * between the last mapped region and the stack will - * expand the stack rather than segfaulting. - */ - if (address + 2048 < uregs->gpr[1] - && (!user_mode(regs) || !store_updates_sp(regs))) - goto bad_area; - } + if (!is_write) + goto bad_area; + + /* + * N.B. The rs6000/xcoff ABI allows programs to access up to + * a few hundred bytes below the stack pointer. + * The kernel signal delivery code writes up to about 1.5kB + * below the stack pointer (r1) before decrementing it. + * The exec code can write slightly over 640kB to the stack + * before setting the user r1. Thus we allow the stack to + * expand to 1MB without further checks. + */ + if (address + 0x100000 < vma->vm_end) { + /* get user regs even if this fault is in kernel mode */ + struct pt_regs *uregs = current->thread.regs; + if (uregs == NULL) + goto bad_area; + + /* + * A user-mode access to an address a long way below + * the stack pointer is only valid if the instruction + * is one which would update the stack pointer to the + * address accessed if the instruction completed, + * i.e. either stwu rs,n(r1) or stwux rs,r1,rb + * (or the byte, halfword, float or double forms). + * + * If we don't check this then any write to the area + * between the last mapped region and the stack will + * expand the stack rather than segfaulting. 
+ */ + if (address + 2048 < uregs->gpr[1] + && (!user_mode(regs) || !store_updates_sp(regs))) + goto bad_area; + } if (expand_stack(vma, address)) goto bad_area; diff -urN linux-2.4.33-pre2-pax/arch/sh/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/sh/mm/fault.c --- linux-2.4.33-pre2-pax/arch/sh/mm/fault.c 2003-08-25 13:44:40.000000000 +0200 +++ linux-2.4.33-pre2-oc+pax-fixed/arch/sh/mm/fault.c 2006-03-01 00:02:26.000000000 +0100 @@ -76,8 +76,6 @@ return 1; check_stack: - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; if (expand_stack(vma, start) == 0) goto good_area; diff -urN linux-2.4.33-pre2-pax/arch/sparc/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/sparc/mm/fault.c --- linux-2.4.33-pre2-pax/arch/sparc/mm/fault.c 2006-03-01 00:02:57.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/arch/sparc/mm/fault.c 2006-03-01 00:03:56.000000000 +0100 @@ -539,8 +539,6 @@ goto bad_area; if(vma->vm_start <= address) goto good_area; - if(!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; if(expand_stack(vma, address)) goto bad_area; /* @@ -809,8 +807,6 @@ goto bad_area; if(vma->vm_start <= address) goto good_area; - if(!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; if(expand_stack(vma, address)) goto bad_area; good_area: diff -urN linux-2.4.33-pre2-pax/arch/sparc64/mm/fault.c linux-2.4.33-pre2-oc+pax-fixed/arch/sparc64/mm/fault.c --- linux-2.4.33-pre2-pax/arch/sparc64/mm/fault.c 2006-03-01 00:02:57.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/arch/sparc64/mm/fault.c 2006-03-01 00:03:56.000000000 +0100 @@ -801,8 +801,6 @@ if (vma->vm_start <= address) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; if (!(fault_code & FAULT_CODE_WRITE)) { /* Non-faulting loads shouldn't expand stack. */ insn = get_fault_insn(regs, insn); diff -urN linux-2.4.33-pre2-pax/fs/exec.c linux-2.4.33-pre2-oc+pax-fixed/fs/exec.c --- linux-2.4.33-pre2-pax/fs/exec.c 2006-03-01 00:02:57.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/fs/exec.c 2006-03-01 00:36:30.000000000 +0100 @@ -374,6 +374,12 @@ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!mpnt) return -ENOMEM; + + if (!vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) + { + kmem_cache_free(vm_area_cachep, mpnt); + return -ENOMEM; + } #ifdef CONFIG_PAX_SEGMEXEC if ((current->mm->pax_flags & MF_PAX_SEGMEXEC) && (VM_STACK_FLAGS & VM_MAYEXEC)) { @@ -412,7 +418,6 @@ if (mpnt_m) kmem_cache_free(vm_area_cachep, mpnt_m); #endif - return ret; } current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; diff -urN linux-2.4.33-pre2-pax/fs/proc/proc_misc.c linux-2.4.33-pre2-oc+pax-fixed/fs/proc/proc_misc.c --- linux-2.4.33-pre2-pax/fs/proc/proc_misc.c 2004-08-08 01:26:06.000000000 +0200 +++ linux-2.4.33-pre2-oc+pax-fixed/fs/proc/proc_misc.c 2006-03-01 00:02:28.000000000 +0100 @@ -158,6 +158,8 @@ struct sysinfo i; int len; int pg_size ; + unsigned long committed; + unsigned long allowed; /* * display in kilobytes. 
@@ -167,6 +169,9 @@ si_meminfo(&i); si_swapinfo(&i); pg_size = page_cache_size - i.bufferram; + committed = atomic_read(&vm_committed_space); + allowed = (i.totalram * sysctl_overcommit_ratio / 100) + + total_swap_pages; len = sprintf(page, " total: used: free: shared: buffers: cached:\n" "Mem: %8Lu %8Lu %8Lu %8Lu %8Lu %8Lu\n" @@ -194,7 +199,9 @@ "LowTotal: %8lu kB\n" "LowFree: %8lu kB\n" "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n", + "SwapFree: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.sharedram), @@ -208,7 +215,9 @@ K(i.totalram-i.totalhigh), K(i.freeram-i.freehigh), K(i.totalswap), - K(i.freeswap)); + K(i.freeswap), + K(allowed), + K(committed)); return proc_calc_metrics(page, start, off, count, eof, len); #undef B diff -urN linux-2.4.33-pre2-pax/include/asm-x86_64/page.h linux-2.4.33-pre2-oc+pax-fixed/include/asm-x86_64/page.h --- linux-2.4.33-pre2-pax/include/asm-x86_64/page.h 2006-03-01 00:02:57.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/include/asm-x86_64/page.h 2006-03-01 00:36:30.000000000 +0100 @@ -158,7 +158,6 @@ #define VM_DATA_DEFAULT_FLAGS \ ((current->thread.flags & THREAD_IA32) ? vm_data_default_flags32 : \ vm_data_default_flags) -#define VM_STACK_FLAGS vm_stack_flags #endif #endif /* __KERNEL__ */ diff -urN linux-2.4.33-pre2-pax/include/linux/mm.h linux-2.4.33-pre2-oc+pax-fixed/include/linux/mm.h --- linux-2.4.33-pre2-pax/include/linux/mm.h 2006-03-01 00:02:57.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/include/linux/mm.h 2006-03-01 00:38:34.000000000 +0100 @@ -115,19 +115,21 @@ #define VM_MAYNOTWRITE 0x00200000 /* vma cannot be granted VM_WRITE any more */ #endif +#define VM_ACCOUNT 0x00400000 /* Memory is a vm accounted object */ + #ifdef __VM_STACK_FLAGS #ifdef ARCH_STACK_GROWSUP -#define VM_STACK_FLAGS (0x00000233 | __VM_STACK_FLAGS) +#define VM_STACK_FLAGS (VM_GROWSUP|VM_MAYWRITE|VM_MAYREAD|VM_WRITE|VM_READ|__VM_STACK_FLAGS|VM_ACCOUNT) #else -#define VM_STACK_FLAGS (0x00000133 | __VM_STACK_FLAGS) +#define VM_STACK_FLAGS (VM_GROWSDOWN|VM_MAYWRITE|VM_MAYREAD|VM_WRITE|VM_READ|__VM_STACK_FLAGS|VM_ACCOUNT) #endif #endif #ifndef VM_STACK_FLAGS #ifdef ARCH_STACK_GROWSUP -#define VM_STACK_FLAGS 0x00000277 +#define VM_STACK_FLAGS (VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT) #else -#define VM_STACK_FLAGS 0x00000177 +#define VM_STACK_FLAGS (VM_DATA_DEFAULT_FLAGS|VM_GROWSDOWN|VM_ACCOUNT) #endif #endif @@ -674,96 +676,14 @@ return gfp_mask; } +/* Do stack extension */ +extern int expand_stack(struct vm_area_struct * vma, unsigned long address); + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* vma is the first one with address < vma->vm_end, - * and even address < vma->vm_start. Have to extend vma. */ -static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) -{ - unsigned long grow; - -#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) - struct vm_area_struct * vma_m = NULL; - unsigned long address_m = 0UL; -#endif - - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the - * page_table_lock lock to serialize against concurrent expand_stacks. - */ - address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); - - /* already expanded while we were spinning? 
*/ - if (vma->vm_start <= address) { - spin_unlock(&vma->vm_mm->page_table_lock); - return 0; - } - - grow = (vma->vm_start - address) >> PAGE_SHIFT; - -#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) - if (vma->vm_flags & VM_MIRROR) { - address_m = vma->vm_start + vma->vm_mirror; - vma_m = find_vma(vma->vm_mm, address_m); - if (!vma_m || vma_m->vm_start != address_m || !(vma_m->vm_flags & VM_MIRROR) || - vma->vm_end - vma->vm_start != vma_m->vm_end - vma_m->vm_start) { - printk(KERN_ERR "PAX: VMMIRROR: expand bug, %08lx, %08lx, %08lx, %08lx, %08lx\n", - address, vma->vm_start, vma_m->vm_start, vma->vm_end, vma_m->vm_end); - spin_unlock(&vma->vm_mm->page_table_lock); - return -ENOMEM; - } - - address_m = address + vma->vm_mirror; - if (2*grow < grow || vma_m->vm_end - address_m > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma_m->vm_mm->total_vm + 2*grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur || - ((vma_m->vm_flags & VM_LOCKED) && - ((vma_m->vm_mm->locked_vm + 2*grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur)) { - spin_unlock(&vma->vm_mm->page_table_lock); - return -ENOMEM; - } - } else -#endif - - if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur || - ((vma->vm_flags & VM_LOCKED) && - ((vma->vm_mm->locked_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur)) { - spin_unlock(&vma->vm_mm->page_table_lock); - return -ENOMEM; - } - - if ((vma->vm_flags & VM_LOCKED) && - ((vma->vm_mm->locked_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); - return -ENOMEM; - } - - - vma->vm_start = address; - vma->vm_pgoff -= grow; - vma->vm_mm->total_vm += grow; - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - -#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) - if (vma->vm_flags & VM_MIRROR) { - vma_m->vm_start = address_m; - vma_m->vm_pgoff -= grow; - vma_m->vm_mm->total_vm += grow; - if (vma_m->vm_flags & VM_LOCKED) - vma_m->vm_mm->locked_vm += grow; - } -#endif - - spin_unlock(&vma->vm_mm->page_table_lock); - return 0; -} - /* Look up the first VMA which intersects the interval start_addr..end_addr-1, NULL if none. Assume start_addr < end_addr. 
*/ static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) diff -urN linux-2.4.33-pre2-pax/include/linux/mman.h linux-2.4.33-pre2-oc+pax-fixed/include/linux/mman.h --- linux-2.4.33-pre2-pax/include/linux/mman.h 2006-02-26 22:54:24.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/include/linux/mman.h 2006-03-01 00:02:28.000000000 +0100 @@ -6,4 +6,10 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 +extern int vm_enough_memory(long pages); +extern void vm_unacct_memory(long pages); +extern void vm_validate_enough(char *x); +extern atomic_t vm_committed_space; +extern int sysctl_overcommit_ratio; + #endif /* _LINUX_MMAN_H */ diff -urN linux-2.4.33-pre2-pax/include/linux/sysctl.h linux-2.4.33-pre2-oc+pax-fixed/include/linux/sysctl.h --- linux-2.4.33-pre2-pax/include/linux/sysctl.h 2006-03-01 00:02:57.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/include/linux/sysctl.h 2006-03-01 00:03:57.000000000 +0100 @@ -170,6 +170,7 @@ VM_LAPTOP_MODE=21, /* kernel in laptop flush mode */ VM_BLOCK_DUMP=22, /* dump fs activity to log */ VM_ANON_LRU=23, /* immediatly insert anon pages in the vm page lru */ + VM_OVERCOMMIT_RATIO=24, /* percent of RAM to allow overcommit in */ }; diff -urN linux-2.4.33-pre2-pax/kernel/fork.c linux-2.4.33-pre2-oc+pax-fixed/kernel/fork.c --- linux-2.4.33-pre2-pax/kernel/fork.c 2005-01-27 18:57:34.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/kernel/fork.c 2006-03-01 00:02:26.000000000 +0100 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -146,6 +147,7 @@ { struct vm_area_struct * mpnt, *tmp, **pprev; int retval; + unsigned long charge; flush_cache_mm(current->mm); mm->locked_vm = 0; @@ -174,6 +176,13 @@ retval = -ENOMEM; if(mpnt->vm_flags & VM_DONTCOPY) continue; + charge = 0; + if(mpnt->vm_flags & VM_ACCOUNT) { + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + if(!vm_enough_memory(len)) + goto fail_nomem; + charge = len; + } tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!tmp) goto fail_nomem; @@ -217,10 +226,12 @@ } retval = 0; build_mmap_rb(mm); - -fail_nomem: +out: flush_tlb_mm(current->mm); return retval; +fail_nomem: + vm_unacct_memory(charge); + goto out; } spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; diff -urN linux-2.4.33-pre2-pax/kernel/sysctl.c linux-2.4.33-pre2-oc+pax-fixed/kernel/sysctl.c --- linux-2.4.33-pre2-pax/kernel/sysctl.c 2006-03-01 00:02:58.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/kernel/sysctl.c 2006-03-01 00:03:57.000000000 +0100 @@ -45,6 +45,7 @@ extern int C_A_D; extern int bdf_prm[], bdflush_min[], bdflush_max[]; extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; extern int max_threads; extern atomic_t nr_queued_signals; extern int max_queued_signals; @@ -319,6 +320,8 @@ &bdflush_min, &bdflush_max}, {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory, sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec}, + {VM_OVERCOMMIT_RATIO, "overcommit_ratio", &sysctl_overcommit_ratio, + sizeof(sysctl_overcommit_ratio), 0644, NULL, &proc_dointvec}, {VM_PAGERDAEMON, "kswapd", &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PGT_CACHE, "pagetable_cache", diff -urN linux-2.4.33-pre2-pax/mm/mlock.c linux-2.4.33-pre2-oc+pax-fixed/mm/mlock.c --- linux-2.4.33-pre2-pax/mm/mlock.c 2006-03-01 00:02:58.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/mm/mlock.c 2006-03-01 00:03:57.000000000 +0100 @@ -252,6 +252,7 @@ unsigned 
long lock_limit; int error = -ENOMEM; + vm_validate_enough("entering sys_mlock"); down_write(&current->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; @@ -274,6 +275,7 @@ error = do_mlock(start, len, 1); out: up_write(&current->mm->mmap_sem); + vm_validate_enough("exiting sys_mlock"); return error; } @@ -281,11 +283,13 @@ { int ret; + vm_validate_enough("entering sys_munlock"); down_write(&current->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; ret = do_mlock(start, len, 0); up_write(&current->mm->mmap_sem); + vm_validate_enough("exiting sys_munlock"); return ret; } @@ -332,6 +336,8 @@ unsigned long lock_limit; int ret = -EINVAL; + vm_validate_enough("entering sys_mlockall"); + down_write(&current->mm->mmap_sem); if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) goto out; @@ -351,15 +357,18 @@ ret = do_mlockall(flags); out: up_write(&current->mm->mmap_sem); + vm_validate_enough("exiting sys_mlockall"); return ret; } asmlinkage long sys_munlockall(void) { int ret; + vm_validate_enough("entering sys_munlockall"); down_write(&current->mm->mmap_sem); ret = do_mlockall(0); up_write(&current->mm->mmap_sem); + vm_validate_enough("exiting sys_munlockall"); return ret; } diff -urN linux-2.4.33-pre2-pax/mm/mmap.c linux-2.4.33-pre2-oc+pax-fixed/mm/mmap.c --- linux-2.4.33-pre2-pax/mm/mmap.c 2006-03-01 00:02:58.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/mm/mmap.c 2006-03-01 01:04:03.000000000 +0100 @@ -1,8 +1,25 @@ /* * linux/mm/mmap.c - * * Written by obz. + * + * Address space accounting code + * (c) Copyright 2002 Red Hat Inc, All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + #include #include #include @@ -45,8 +62,10 @@ __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; -int sysctl_overcommit_memory; +int sysctl_overcommit_memory = 0; /* default is heuristic overcommit */ +int sysctl_overcommit_ratio = 50; /* default is 50% */ int max_map_count = DEFAULT_MAX_MAP_COUNT; +atomic_t vm_committed_space = ATOMIC_INIT(0); /* Check that a process has enough memory to allocate a * new virtual mapping. @@ -56,42 +75,107 @@ /* Stupid algorithm to decide if we have enough memory: while * simple, it hopefully works in most obvious cases.. Easy to * fool it, but this should catch most mistakes. - */ - /* 23/11/98 NJC: Somewhat less stupid version of algorithm, + * + * 23/11/98 NJC: Somewhat less stupid version of algorithm, * which tries to do "TheRightThing". Instead of using half of * (buffers+cache), use the minimum values. Allow an extra 2% * of num_physpages for safety margin. + * + * 2002/02/26 Alan Cox: Added two new modes that do real accounting */ + unsigned long free, allowed; + struct sysinfo i; - unsigned long free; + atomic_add(pages, &vm_committed_space); /* Sometimes we want to use more memory than we have. 
*/ - if (sysctl_overcommit_memory) - return 1; + if (sysctl_overcommit_memory == 1) + return 1; + + if (sysctl_overcommit_memory == 0) + { + /* The page cache contains buffer pages these days.. */ + free = page_cache_size; + free += nr_free_pages(); + free += nr_swap_pages; + + /* + * This double-counts: the nrpages are both in the page-cache + * and in the swapper space. At the same time, this compensates + * for the swap-space over-allocation (ie "nr_swap_pages" being + * too small. + */ + free += swapper_space.nrpages; + + /* + * The code below doesn't account for free space in the inode + * and dentry slab cache, slab cache fragmentation, inodes and + * dentries which will become freeable under VM load, etc. + * Lets just hope all these (complex) factors balance out... + */ + free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; + free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + + if(free > pages) + return 1; + atomic_sub(pages, &vm_committed_space); + return 0; + } - /* The page cache contains buffer pages these days.. */ - free = page_cache_size; - free += nr_free_pages(); - free += nr_swap_pages; + /* FIXME - need to add arch hooks to get the bits we need without the higher overhead crap */ + si_meminfo(&i); + allowed = i.totalram * sysctl_overcommit_ratio / 100; + allowed += total_swap_pages; + + if(atomic_read(&vm_committed_space) < allowed) + return 1; - /* - * This double-counts: the nrpages are both in the page-cache - * and in the swapper space. At the same time, this compensates - * for the swap-space over-allocation (ie "nr_swap_pages" being - * too small. - */ - free += swapper_space.nrpages; + atomic_sub(pages, &vm_committed_space); + return 0; +} - /* - * The code below doesn't account for free space in the inode - * and dentry slab cache, slab cache fragmentation, inodes and - * dentries which will become freeable under VM load, etc. - * Lets just hope all these (complex) factors balance out... - */ - free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; - free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; +void vm_unacct_memory(long pages) +{ + atomic_sub(pages, &vm_committed_space); +} - return free > pages; +/* + * Don't even bother telling me the locking is wrong - it's a test + * routine and uniprocessor is quite sufficient.. + * + * To enable this debugging you must tweak the #if below, and build + * with no SYS5 shared memory (that's not validated yet) and non-SMP + */ + +void vm_validate_enough(char *x) +{ +#if 0 + unsigned long count = 0UL; + struct mm_struct *mm; + struct vm_area_struct *vma; + struct list_head *mmp; + unsigned long flags; + + spin_lock_irqsave(&mmlist_lock, flags); + + list_for_each(mmp, &init_mm.mmlist) + { + mm = list_entry(mmp, struct mm_struct, mmlist); + for(vma = mm->mmap; vma!=NULL; vma=vma->vm_next) + { + if(vma->vm_flags & VM_ACCOUNT) + count += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + } + } + if(count != atomic_read(&vm_committed_space)) + { + printk("MM crappo accounting %s: %lu %d.\n", + x, count, atomic_read(&vm_committed_space)); + atomic_set(&vm_committed_space, count); + } + spin_unlock_irqrestore(&mmlist_lock, flags); +#endif } /* Remove one vm structure from the inode's i_mapping address space. */ @@ -168,6 +252,7 @@ } /* Check against rlimit.. */ + /* FIXME: - this seems to be checked in do_brk.. 
*/ rlim = current->rlim[RLIMIT_DATA].rlim_cur; if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) goto out; @@ -176,10 +261,6 @@ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) goto out; - /* Check if we have enough memory.. */ - if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) - goto out; - /* Ok, looks good - let it rip. */ if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) goto out; @@ -442,29 +523,9 @@ int correct_wcount = 0; int error; rb_node_t ** rb_link, * rb_parent; + unsigned long charged = 0; -#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) - struct vm_area_struct * vma_m = NULL; - - if (flags & MAP_MIRROR) { - /* PaX: sanity checks, to be removed when proved to be stable */ - if (file || len || ((flags & MAP_TYPE) != MAP_PRIVATE)) - return -EINVAL; - - vma_m = find_vma(mm, pgoff); - - if (!vma_m || - vma_m->vm_start != pgoff || - (vma_m->vm_flags & VM_MIRROR) || - (!(vma_m->vm_flags & VM_WRITE) && (prot & PROT_WRITE))) - return -EINVAL; - - file = vma_m->vm_file; - pgoff = vma_m->vm_pgoff; - len = vma_m->vm_end - vma_m->vm_start; - } -#endif - + vm_validate_enough("entering do_mmap_pgoff"); if (file) { if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -534,6 +595,28 @@ return -EAGAIN; } +#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) + struct vm_area_struct * vma_m = NULL; + + if (flags & MAP_MIRROR) { + /* PaX: sanity checks, to be removed when proved to be stable */ + if (file || len || ((flags & MAP_TYPE) != MAP_PRIVATE)) + return -EINVAL; + + vma_m = find_vma(mm, pgoff); + + if (!vma_m || + vma_m->vm_start != pgoff || + (vma_m->vm_flags & VM_MIRROR) || + (!(vma_m->vm_flags & VM_WRITE) && (prot & PROT_WRITE))) + return -EINVAL; + + file = vma_m->vm_file; + pgoff = vma_m->vm_pgoff; + len = vma_m->vm_end - vma_m->vm_start; + } +#endif + if (file) { switch (flags & MAP_TYPE) { case MAP_SHARED: @@ -588,11 +671,15 @@ > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; - /* Private writable mapping? Check memory availability.. */ - if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && - !(flags & MAP_NORESERVE) && - !vm_enough_memory(len >> PAGE_SHIFT)) - return -ENOMEM; + if (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory > 1) { + if ((vm_flags & (VM_SHARED|VM_WRITE)) == VM_WRITE) { + /* Private writable mapping: check memory availability */ + charged = len >> PAGE_SHIFT; + if (!vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + } /* Can we just expand an old anonymous mapping? */ if (!file && !(vm_flags & VM_SHARED) && rb_parent) @@ -604,8 +691,9 @@ * not unmapped, but the maps are removed from the list. */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + error = -ENOMEM; if (!vma) - return -ENOMEM; + goto unacct_error; vma->vm_mm = mm; vma->vm_start = addr; @@ -673,8 +761,7 @@ * to update the tree pointers. */ addr = vma->vm_start; - stale_vma = find_vma_prepare(mm, addr, &prev, - &rb_link, &rb_parent); + stale_vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); /* * Make sure the lowlevel driver did its job right. 
*/ @@ -695,6 +782,7 @@ mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); } + vm_validate_enough("out from do_mmap_pgoff"); return addr; unmap_and_free_vma: @@ -707,6 +795,10 @@ zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); free_vma: kmem_cache_free(vm_area_cachep, vma); +unacct_error: + if(charged) + vm_unacct_memory(charged); + vm_validate_enough("error path from do_mmap_pgoff"); return error; } @@ -865,6 +957,194 @@ return NULL; } +/* vma is the first one with address < vma->vm_end, + * and even address < vma->vm_start. Have to extend vma. */ + +#ifdef ARCH_STACK_GROWSUP +static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) +{ + unsigned long grow; + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + vm_validate_enough("entering expand_stack"); + +#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) + struct vm_area_struct * vma_m = NULL; + unsigned long address_m = 0UL; +#endif + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * page_table_lock lock to serialize against concurrent expand_stacks. + */ + spin_lock(&vma->vm_mm->page_table_lock); + + address += 4 + PAGE_SIZE - 1; + address &= PAGE_MASK; + + /* already expanded while we were spinning? */ + if (vma->vm_start <= address) { + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; + } + + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + /* Overcommit... */ + if (!vm_enough_memory(grow)) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + +#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) + if (vma->vm_flags & VM_MIRROR) { + address_m = vma->vm_start + vma->vm_mirror; + vma_m = find_vma(vma->vm_mm, address_m); + if (!vma_m || vma_m->vm_start != address_m || !(vma_m->vm_flags & VM_MIRROR) || + vma->vm_end - vma->vm_start != vma_m->vm_end - vma_m->vm_start) { + printk(KERN_ERR "PAX: VMMIRROR: expand bug, %08lx, %08lx, %08lx, %08lx, %08lx\n", + address, vma->vm_start, vma_m->vm_start, vma->vm_end, vma_m->vm_end); + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + + address_m = address + vma->vm_mirror; + if (2*grow < grow || vma_m->vm_end - address_m > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma_m->vm_mm->total_vm + 2*grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur || + ((vma_m->vm_flags & VM_LOCKED) && + ((vma_m->vm_mm->locked_vm + 2*grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur)) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + } else +#endif + + if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur || + ((vma->vm_flags & VM_LOCKED) && + ((vma->vm_mm->locked_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur)) { + spin_unlock(&vma->vm_mm->page_table_lock); + vm_unacct_memory(grow); + vm_validate_enough("exiting expand_stack - FAIL"); + return -ENOMEM; + } + + vma->vm_end = address; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + +#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) + if (vma->vm_flags & VM_MIRROR) { + vma_m->vm_start = address_m; + vma_m->vm_pgoff -= grow; + vma_m->vm_mm->total_vm += grow; + if (vma_m->vm_flags & VM_LOCKED) + vma_m->vm_mm->locked_vm += grow; + } +#endif + + spin_unlock(&vma->vm_mm->page_table_lock); + vm_validate_enough("exiting 
expand_stack"); + return 0; +} +#else + +int expand_stack(struct vm_area_struct * vma, unsigned long address) +{ + unsigned long grow; + + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + + vm_validate_enough("entering expand_stack"); + +#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) + struct vm_area_struct * vma_m = NULL; + unsigned long address_m = 0UL; +#endif + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * page_table_lock lock to serialize against concurrent expand_stacks. + */ + address &= PAGE_MASK; + spin_lock(&vma->vm_mm->page_table_lock); + + /* already expanded while we were spinning? */ + if (vma->vm_start <= address) { + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; + } + + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + /* Overcommit... */ + if (!vm_enough_memory(grow)) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + + +#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) + if (vma->vm_flags & VM_MIRROR) { + address_m = vma->vm_start + vma->vm_mirror; + vma_m = find_vma(vma->vm_mm, address_m); + if (!vma_m || vma_m->vm_start != address_m || !(vma_m->vm_flags & VM_MIRROR) || + vma->vm_end - vma->vm_start != vma_m->vm_end - vma_m->vm_start) { + printk(KERN_ERR "PAX: VMMIRROR: expand bug, %08lx, %08lx, %08lx, %08lx, %08lx\n", + address, vma->vm_start, vma_m->vm_start, vma->vm_end, vma_m->vm_end); + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + + address_m = address + vma->vm_mirror; + if (2*grow < grow || vma_m->vm_end - address_m > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma_m->vm_mm->total_vm + 2*grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur || + ((vma_m->vm_flags & VM_LOCKED) && + ((vma_m->vm_mm->locked_vm + 2*grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur)) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + } else +#endif + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur || + ((vma->vm_flags & VM_LOCKED) && + ((vma->vm_mm->locked_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur)) { + spin_unlock(&vma->vm_mm->page_table_lock); + vm_unacct_memory(grow); + vm_validate_enough("exiting expand_stack - FAIL"); + return -ENOMEM; + } + + vma->vm_start = address; + vma->vm_pgoff -= grow; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + +#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) + if (vma->vm_flags & VM_MIRROR) { + vma_m->vm_start = address_m; + vma_m->vm_pgoff -= grow; + vma_m->vm_mm->total_vm += grow; + if (vma_m->vm_flags & VM_LOCKED) + vma_m->vm_mm->locked_vm += grow; + } +#endif + + spin_unlock(&vma->vm_mm->page_table_lock); + vm_validate_enough("exiting expand_stack"); + return 0; +} + +#endif + struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) { struct vm_area_struct * vma; @@ -920,6 +1200,8 @@ area->vm_mm->total_vm -= len >> PAGE_SHIFT; if (area->vm_flags & VM_LOCKED) area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_ACCOUNT) + vm_unacct_memory(len >> PAGE_SHIFT); /* Unmapping the whole area. 
*/ if (addr == area->vm_start && end == area->vm_end) { @@ -1137,6 +1419,8 @@ { struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; + vm_validate_enough("entering do_munmap"); + #if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_RANDEXEC) struct vm_area_struct *free_m, *extra_m; #endif @@ -1232,9 +1516,11 @@ kmem_cache_free(vm_area_cachep, extra_m); #endif - /* Release the extra vma struct if it wasn't used */ - if (extra) - kmem_cache_free(vm_area_cachep, extra); + /* Release the extra vma struct if it wasn't used */ + if (extra) + kmem_cache_free(vm_area_cachep, extra); + + vm_validate_enough("exit -ok- do_munmap"); return 0; } @@ -1301,6 +1587,9 @@ unsigned long flags, task_size = TASK_SIZE; rb_node_t ** rb_link, * rb_parent; + vm_validate_enough("entering do_brk"); + + len = PAGE_ALIGN(len); if (!len) return addr; @@ -1351,7 +1640,7 @@ if (!vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; - flags = VM_DATA_DEFAULT_FLAGS | mm->def_flags; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; #if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) if (mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { @@ -1374,8 +1663,11 @@ */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) + { + /* We accounted this address space - undo it */ + vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; - + } vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -1402,6 +1694,9 @@ mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); } + + vm_validate_enough("exiting do_brk"); + return addr; } @@ -1443,6 +1738,10 @@ unsigned long end = mpnt->vm_end; unsigned long size = end - start; + /* If the VMA has been charged for, account for its removal */ + if (mpnt->vm_flags & VM_ACCOUNT) + vm_unacct_memory(size >> PAGE_SHIFT); + if (mpnt->vm_ops) { if (mpnt->vm_ops->close) mpnt->vm_ops->close(mpnt); @@ -1461,8 +1760,9 @@ BUG(); clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); - flush_tlb_mm(mm); + vm_validate_enough("exiting exit_mmap"); + } /* Insert vm structure into process list sorted by address diff -urN linux-2.4.33-pre2-pax/mm/mprotect.c linux-2.4.33-pre2-oc+pax-fixed/mm/mprotect.c --- linux-2.4.33-pre2-pax/mm/mprotect.c 2006-03-01 00:02:58.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/mm/mprotect.c 2006-03-01 00:03:57.000000000 +0100 @@ -2,6 +2,23 @@ * linux/mm/mprotect.c * * (C) Copyright 1994 Linus Torvalds + * + * Address space accounting code + * (c) Copyright 2002 Red Hat Inc, All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include @@ -288,11 +305,28 @@ { pgprot_t newprot; int error; + unsigned long charged = 0; if (newflags == vma->vm_flags) { *pprev = vma; return 0; } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. + * + * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting + * a MAP_NORESERVE private mapping to writable will now reserve. + */ + if ((newflags & VM_WRITE) && + !(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { + charged = (end - start) >> PAGE_SHIFT; + if (!vm_enough_memory(charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } #endif #ifdef CONFIG_PAX_PAGEEXEC @@ -311,10 +345,10 @@ error = mprotect_fixup_end(vma, pprev, start, newflags, newprot); else error = mprotect_fixup_middle(vma, pprev, start, end, newflags, newprot); - - if (error) + if (error) { + vm_unacct_memory(charged); return error; - + } change_protection(start, end, newprot); return 0; } @@ -387,6 +421,8 @@ struct vm_area_struct * vma, * next, * prev; int error = -EINVAL; + vm_validate_enough("entering mprotect"); + if (start & ~PAGE_MASK) return -EINVAL; len = PAGE_ALIGN(len); @@ -472,5 +508,6 @@ } out: up_write(&current->mm->mmap_sem); + vm_validate_enough("exiting mprotect"); return error; } diff -urN linux-2.4.33-pre2-pax/mm/mremap.c linux-2.4.33-pre2-oc+pax-fixed/mm/mremap.c --- linux-2.4.33-pre2-pax/mm/mremap.c 2006-03-01 00:02:58.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/mm/mremap.c 2006-03-01 00:36:30.000000000 +0100 @@ -2,6 +2,23 @@ * linux/mm/remap.c * * (C) Copyright 1996 Linus Torvalds + * + * Address space accounting code + * (c) Copyright 2002 Red Hat Inc, All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include @@ -13,8 +30,6 @@ #include #include -extern int vm_enough_memory(long pages); - static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) { pgd_t * pgd; @@ -133,6 +148,9 @@ struct mm_struct * mm = vma->vm_mm; struct vm_area_struct * new_vma, * next, * prev; int allocated_vma; + unsigned long excess = 0; + int split = 0; + new_vma = NULL; next = find_vma_prev(mm, new_addr, &prev); @@ -187,7 +205,7 @@ *new_vma = *vma; new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; - new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; new_vma->vm_raend = 0; if (new_vma->vm_file) get_file(new_vma->vm_file); @@ -197,9 +215,35 @@ } /* XXX: possible errors masked, mapping might remain */ - do_munmap(current->mm, addr, old_len); + /* Conceal VM_ACCOUNT so old reservation is not undone */ + if (vma->vm_flags & VM_ACCOUNT) { + vma->vm_flags &= ~VM_ACCOUNT; + excess = vma->vm_end - vma->vm_start - old_len; + if (addr > vma->vm_start && + addr + old_len < vma->vm_end) + split = 1; + } + /* + * if we failed to move page tables we still do total_vm + * increment since do_munmap() will decrement it by + * old_len == new_len + */ current->mm->total_vm += new_len >> PAGE_SHIFT; + + if (do_munmap(mm, addr, old_len) < 0) { + /* OOM: unable to split vma, just get accounts right */ + vm_unacct_memory(excess >> PAGE_SHIFT); + excess = 0; + } + + /* Restore VM_ACCOUNT if one or two pieces of vma left */ + if (excess) { + vma->vm_flags |= VM_ACCOUNT; + if (split) + vma->vm_next->vm_flags |= VM_ACCOUNT; + } + if (vm_locked) { current->mm->locked_vm += new_len >> PAGE_SHIFT; if (new_len > old_len) @@ -226,7 +270,8 @@ unsigned long flags, unsigned long new_addr) { struct vm_area_struct *vma; - unsigned long ret = -EINVAL; + unsigned long ret = -EINVAL, task_size = TASK_SIZE; + unsigned long charged = 0; if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) goto out; @@ -300,6 +345,7 @@ /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. + * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { ret = do_munmap(current->mm, addr+new_len, old_len - new_len); @@ -345,11 +391,12 @@ if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) > current->rlim[RLIMIT_AS].rlim_cur) goto out; - /* Private writable mapping? Check memory availability.. */ - if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && - !(flags & MAP_NORESERVE) && - !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) - goto out; + + if (vma->vm_flags & VM_ACCOUNT) { + charged = (new_len - old_len) >> PAGE_SHIFT; + if(!vm_enough_memory(charged)) + goto out_nc; + } /* old_len exactly to the end of the area.. * And we're not relocating the area. 
@@ -373,6 +420,7 @@ addr + new_len); } ret = addr; + vm_validate_enough("mremap path1"); goto out; } } @@ -396,6 +444,12 @@ ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: + if(ret & ~PAGE_MASK) + { + vm_unacct_memory(charged); + vm_validate_enough("mremap error path"); + } +out_nc: return ret; } @@ -405,8 +459,10 @@ { unsigned long ret; + vm_validate_enough("entry to mremap"); down_write(&current->mm->mmap_sem); ret = do_mremap(addr, old_len, new_len, flags, new_addr); up_write(&current->mm->mmap_sem); + vm_validate_enough("exit from mremap"); return ret; } diff -urN linux-2.4.33-pre2-pax/mm/shmem.c linux-2.4.33-pre2-oc+pax-fixed/mm/shmem.c --- linux-2.4.33-pre2-pax/mm/shmem.c 2004-11-17 12:54:22.000000000 +0100 +++ linux-2.4.33-pre2-oc+pax-fixed/mm/shmem.c 2006-03-01 00:02:26.000000000 +0100 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -395,10 +396,20 @@ { struct inode *inode = dentry->d_inode; struct page *page = NULL; + long change = 0; int error; - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size < inode->i_size) { + if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size <= SHMEM_MAX_BYTES)) { + /* + * Account swap file usage based on new file size, + * but just let vmtruncate fail on out-of-range sizes. + */ + change = VM_ACCT(attr->ia_size) - VM_ACCT(inode->i_size); + if (change > 0) { + if (!vm_enough_memory(change)) + return -ENOMEM; + } else if (attr->ia_size < inode->i_size) { + vm_unacct_memory(-change); /* * If truncating down to a partial page, then * if that page is already allocated, hold it @@ -432,6 +443,8 @@ error = inode_setattr(inode, attr); if (page) page_cache_release(page); + if (error) + vm_unacct_memory(change); return error; } @@ -444,6 +457,7 @@ spin_lock(&shmem_ilock); list_del(&info->list); spin_unlock(&shmem_ilock); + vm_unacct_memory(VM_ACCT(inode->i_size)); inode->i_size = 0; shmem_truncate(inode); } @@ -974,6 +988,7 @@ loff_t pos; unsigned long written; ssize_t err; + loff_t maxpos; if ((ssize_t) count < 0) return -EINVAL; @@ -990,6 +1005,15 @@ if (err || !count) goto out; + maxpos = inode->i_size; + if (maxpos < pos + count) { + maxpos = pos + count; + if (!vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) { + err = -ENOMEM; + goto out; + } + } + remove_suid(inode); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -998,12 +1022,15 @@ unsigned long bytes, index, offset; char *kaddr; int left; + int deactivate = 1; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * We don't hold page lock across copy from user - @@ -1028,6 +1055,12 @@ flush_dcache_page(page); SetPageDirty(page); SetPageReferenced(page); +#ifdef PG_inactive_dirty + if (deactivate) + deactivate_page(page); + else + mark_page_accessed(page); +#endif page_cache_release(page); if (left) { @@ -1041,6 +1074,10 @@ *ppos = pos; if (written) err = written; + + /* Short writes give back address space */ + if (inode->i_size != maxpos) + vm_unacct_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size)); out: up(&inode->i_sem); return err; @@ -1364,8 +1401,13 @@ memcpy(info, symname, len); inode->i_op = &shmem_symlink_inline_operations; } else { + if (!vm_enough_memory(VM_ACCT(1))) { + iput(inode); + return -ENOMEM; + } error = shmem_getpage(inode, 0, &page, SGP_WRITE); if (error) { + vm_unacct_memory(VM_ACCT(1)); iput(inode); return error; } @@ -1684,7 +1726,6 @@ struct inode *inode; struct dentry 
*dentry, *root; struct qstr this; - int vm_enough_memory(long pages); if (IS_ERR(shm_mnt)) return (void *)shm_mnt; @@ -1695,13 +1736,14 @@ if (!vm_enough_memory(VM_ACCT(size))) return ERR_PTR(-ENOMEM); + error = -ENOMEM; this.name = name; this.len = strlen(name); this.hash = 0; /* will go */ root = shm_mnt->mnt_root; dentry = d_alloc(root, &this); if (!dentry) - return ERR_PTR(-ENOMEM); + goto put_memory; error = -ENFILE; file = get_empty_filp(); @@ -1726,6 +1768,8 @@ put_filp(file); put_dentry: dput(dentry); +put_memory: + vm_unacct_memory(VM_ACCT(size)); return ERR_PTR(error); }
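
Worked examples
---------------

The sketches below are not part of the patch; they illustrate the
facility described in Documentation/vm/overcommit-accounting above from
userspace. They assume a kernel with this patch applied, procfs mounted
and glibc; file names and sizes in the code are illustrative.

First, selecting a policy and reading back the commit figures the patch
exports (writing the sysctl needs root):

    /* overcommit_peek.c - set strict overcommit, print commit figures */
    #include <stdio.h>
    #include <string.h>

    static long meminfo_kb(const char *field)
    {
        char line[128];
        long val = -1;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
            return -1;
        while (fgets(line, sizeof(line), f)) {
            if (!strncmp(line, field, strlen(field))) {
                sscanf(line + strlen(field), ": %ld", &val);
                break;
            }
        }
        fclose(f);
        return val;
    }

    int main(void)
    {
        FILE *f = fopen("/proc/sys/vm/overcommit_memory", "w");

        if (f) {                        /* select mode 2: strict */
            fputs("2\n", f);
            fclose(f);
        }
        printf("CommitLimit:  %ld kB\n", meminfo_kb("CommitLimit"));
        printf("Committed_AS: %ld kB\n", meminfo_kb("Committed_AS"));
        return 0;
    }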
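In the strict modes, vm_enough_memory() in mm/mmap.c admits a charge only
while vm_committed_space stays below totalram * overcommit_ratio / 100 +
total_swap_pages, the same figure /proc/meminfo now reports as
CommitLimit; the swap-only limit the documentation promises for mode 3
appears in this version to correspond to running with overcommit_ratio
set to 0. A rough userspace restatement of that arithmetic via
sysinfo(2), approximate because sysinfo reports mem_unit-sized blocks
rather than kernel pages:

    /* commit_limit.c - recompute the strict-mode commit limit */
    #include <stdio.h>
    #include <sys/sysinfo.h>

    int main(void)
    {
        struct sysinfo si;
        unsigned long long ram_kb, swap_kb;
        int ratio = 50;     /* default vm.overcommit_ratio in mm/mmap.c */

        if (sysinfo(&si) != 0) {
            perror("sysinfo");
            return 1;
        }
        ram_kb  = (unsigned long long)si.totalram  * si.mem_unit / 1024;
        swap_kb = (unsigned long long)si.totalswap * si.mem_unit / 1024;

        printf("mode 2 limit ~ %llu kB\n", ram_kb * ratio / 100 + swap_kb);
        printf("mode 3 limit ~ %llu kB (ratio 0)\n", swap_kb);
        return 0;
    }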
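The How It Works table can be watched live: a PRIVATE WRITABLE anonymous
mapping moves Committed_AS by its full size the moment it is created,
while a SHARED READ-only file mapping should not move it at all. A sketch
(Committed_AS is system-wide, so other activity adds noise; /etc/passwd
is just a convenient file to map):

    /* commit_demo.c - watch Committed_AS across two mmap() calls */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static long committed_kb(void)
    {
        char line[128];
        long val = -1;
        FILE *f = fopen("/proc/meminfo", "r");

        while (f && fgets(line, sizeof(line), f))
            if (sscanf(line, "Committed_AS: %ld", &val) == 1)
                break;
        if (f)
            fclose(f);
        return val;
    }

    int main(void)
    {
        const size_t len = 16 << 20;            /* 16 MB */
        long before = committed_kb();
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int fd;

        if (p == MAP_FAILED)
            return 1;
        printf("anon PRIVATE WRITABLE: +%ld kB (expect ~%lu)\n",
               committed_kb() - before, (unsigned long)(len >> 10));
        munmap(p, len);

        fd = open("/etc/passwd", O_RDONLY);
        if (fd < 0)
            return 1;
        before = committed_kb();
        p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
        printf("file SHARED READ-only: +%ld kB (expect 0)\n",
               committed_kb() - before);
        if (p != MAP_FAILED)
            munmap(p, 4096);
        close(fd);
        return 0;
    }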
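The behaviour the documentation promises for the strict modes - failure
arrives as a NULL from malloc() (ENOMEM from the mmap/brk code), not as a
kill while touching pages - can be exercised directly. Run this under
overcommit_memory=2; it deliberately keeps allocating until the commit
limit refuses it:

    /* strict_fail.c - allocate and touch until the commit limit bites */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        const size_t chunk = 64 << 20;      /* 64 MB per step */
        size_t total = 0;
        char *p;

        for (;;) {
            p = malloc(chunk);
            if (!p)
                break;                      /* clean ENOMEM, no OOM kill */
            memset(p, 0xaa, chunk);         /* safe: space was accounted */
            total += chunk;
        }
        printf("malloc failed cleanly after %lu MB\n",
               (unsigned long)(total >> 20));
        return 0;
    }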
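Finally, the Gotchas advice to mmap your stack up front, applied to a
clone() child: the whole stack is a PRIVATE WRITABLE (and therefore
accounted) mapping, so a shortage surfaces at mmap() time instead of deep
inside the child's recursion. For the initial thread, whose VMA the
kernel grows on demand, the rough equivalent is touching the stack to its
worst-case depth early. Sketch for a stack-grows-down machine; the 1 MB
worst case is an assumption:

    /* premapped_stack.c - reserve a child's worst-case stack up front */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    #define STACK_SZ (1 << 20)          /* assumed worst case: 1 MB */

    static int worker(void *arg)
    {
        return 0;                       /* deep recursion would go here */
    }

    int main(void)
    {
        char *stk = mmap(NULL, STACK_SZ, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        pid_t pid;

        if (stk == MAP_FAILED) {        /* commit refused: fail early */
            perror("mmap stack");
            return 1;
        }
        pid = clone(worker, stk + STACK_SZ, SIGCHLD, NULL);
        if (pid < 0) {
            perror("clone");
            return 1;
        }
        waitpid(pid, NULL, 0);
        munmap(stk, STACK_SZ);
        return 0;
    }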