Addresses the problem where all of ZONE_NORMAL is full of buffer_heads.
Normal page reclaim will view all of this memory as non-freeable, because
the pages which those buffer_heads describe live in highmem.  So what the
patch does is: while scanning the LRU pages, also check whether a page in
the *wrong* zone has buffers in the *right* zone.  If it does, strip the
page's buffers but leave the page itself alone.

hmm.  Are we sure that we subsequently call the right slab-cache shrink
function to actually free the pages which backed those buffer_heads?

We discussed making this code conditional on CONFIG_HIGHMEM64G only, as
the problem has not been observed on any other config.  But the same
problem could theoretically occur with ZONE_DMA, so the code is left in
unconditionally.

Testing with a 50/50 highmem/normal split shows that
memclass_related_bhs() is almost never called - presumably it will be
called more often as the highmem/normal ratio grows.  And it will only be
called for allocations which specifically ask for ZONE_NORMAL - mainly
fs metadata.

Bottom line: the code works and the CPU cost is negligible.
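For readers without a 2.4 tree at hand, below is a minimal userspace
sketch of the decision the new helper makes.  All names in it (mock_page,
mock_bh, the zone enum) are invented stand-ins, and the `<=` test only
approximates the real memclass(), which also requires both zones to be in
the same node.  This is an illustration, not kernel code.

	#include <stdio.h>

	enum zone { MOCK_ZONE_DMA, MOCK_ZONE_NORMAL, MOCK_ZONE_HIGHMEM };

	/* Stand-in for struct buffer_head: a slab object with a backing zone. */
	struct mock_bh {
		enum zone bh_zone;		/* zone of the slab page holding this bh */
		struct mock_bh *b_this_page;	/* circular ring, as in 2.4 */
	};

	/* Stand-in for struct page. */
	struct mock_page {
		enum zone page_zone;
		struct mock_bh *buffers;	/* NULL when no buffers are attached */
	};

	/*
	 * Mirrors what memclass_related_bhs() decides: walk the circular
	 * bh ring and report whether any buffer_head is backed by memory
	 * that the caller's classzone covers.
	 */
	int mock_related_bhs(const struct mock_page *page, enum zone classzone)
	{
		const struct mock_bh *bh = page->buffers, *tmp = bh;

		do {
			if (tmp->bh_zone <= classzone)	/* approximates memclass() */
				return 1;
			tmp = tmp->b_this_page;
		} while (tmp != bh);
		return 0;
	}

	int main(void)
	{
		/* A highmem data page whose buffer_heads live in ZONE_NORMAL. */
		struct mock_bh b1 = { MOCK_ZONE_NORMAL, NULL };
		struct mock_bh b2 = { MOCK_ZONE_NORMAL, &b1 };
		struct mock_page page = { MOCK_ZONE_HIGHMEM, &b1 };

		b1.b_this_page = &b2;	/* close the ring */

		/*
		 * Reclaim on behalf of ZONE_NORMAL: the page itself is in
		 * the wrong zone, but its metadata is not, so the scan
		 * should strip the buffers and leave the page alone.
		 */
		printf("strip buffers: %d\n",
		       mock_related_bhs(&page, MOCK_ZONE_NORMAL));
		return 0;
	}

The walk relies on page->buffers being a circular ring linked through
b_this_page, so one pass around the ring visits every buffer_head
attached to the page; each buffer_head is a slab object whose backing
page may sit in a different zone than the data page it describes.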
=====================================

--- 2.4.19-pre4/mm/vmscan.c~aa-230-free_zone_bhs	Tue Mar 26 23:11:38 2002
+++ 2.4.19-pre4-akpm/mm/vmscan.c	Tue Mar 26 23:11:38 2002
@@ -352,10 +352,30 @@ static int swap_out(zone_t * classzone)
 			return 1;
 	} while (--counter >= 0);
+out:
+	if (unlikely(vm_gfp_debug)) {
+		printk(KERN_NOTICE "swap_out: failed\n");
+		dump_stack();
+	}
 	return 0;
 
 empty:
 	spin_unlock(&mmlist_lock);
+	goto out;
+}
+
+static int FASTCALL(memclass_related_bhs(struct page * page, zone_t * classzone));
+static int memclass_related_bhs(struct page * page, zone_t * classzone)
+{
+	struct buffer_head * tmp, * bh = page->buffers;
+
+	tmp = bh;
+	do {
+		if (memclass(page_zone(virt_to_page(tmp)), classzone))
+			return 1;
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
+	return 0;
 }
 
@@ -369,6 +389,7 @@ static int shrink_cache(int nr_pages, zo
 	while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
 		struct page * page;
+		int only_metadata;
 
 		if (unlikely(current->need_resched)) {
 			spin_unlock(&pagemap_lru_lock);
@@ -395,8 +416,30 @@ static int shrink_cache(int nr_pages, zo
 		if (unlikely(!page_count(page)))
 			continue;
 
-		if (!memclass(page_zone(page), classzone))
+		only_metadata = 0;
+		if (!memclass(page_zone(page), classzone)) {
+			/*
+			 * Hack to address an issue found by Rik. The problem is that
+			 * highmem pages can hold buffer heads allocated
+			 * from the slab on lowmem, and so if we are working
+			 * on the NORMAL classzone here, it is correct not to
+			 * try to free the highmem pages themselves (that would be useless)
+			 * but we must make sure to drop any lowmem metadata related to those
+			 * highmem pages.
+			 */
+			if (page->buffers && page->mapping) { /* fast path racy check */
+				if (unlikely(TryLockPage(page)))
+					continue;
+				if (page->buffers && page->mapping && memclass_related_bhs(page, classzone)) { /* non racy check */
+					only_metadata = 1;
+					goto free_bhs;
+				}
+				UnlockPage(page);
+			}
 			continue;
+		}
+
+		max_scan--;
 
 		/* Racy check to avoid trylocking when not worthwhile */
 		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
@@ -449,6 +492,7 @@ static int shrink_cache(int nr_pages, zo
 		 * the page as well.
 		 */
 		if (page->buffers) {
+free_bhs:
 			spin_unlock(&pagemap_lru_lock);
 
 			/* avoid to free a locked page */
@@ -481,6 +525,10 @@ static int shrink_cache(int nr_pages, zo
 				page_cache_release(page);
 
 				spin_lock(&pagemap_lru_lock);
+				if (only_metadata) {
+					UnlockPage(page);
+					continue;
+				}
 			}
 		} else {
 			/* failed to drop the buffers so stop here */
@@ -582,16 +630,40 @@ static void refill_inactive(int nr_pages
 	entry = active_list.prev;
 	while (ratio && entry != &active_list) {
 		struct page * page;
+		int related_metadata = 0;
 
 		page = list_entry(entry, struct page, lru);
 		entry = entry->prev;
+
+		if (!memclass(page_zone(page), classzone)) {
+			/*
+			 * Hack to address an issue found by Rik. The problem is that
+			 * highmem pages can hold buffer heads allocated
+			 * from the slab on lowmem, and so if we are working
+			 * on the NORMAL classzone here, it is correct not to
+			 * try to free the highmem pages themselves (that would be useless)
+			 * but we must make sure to drop any lowmem metadata related to those
+			 * highmem pages.
+			 */
+			if (page->buffers && page->mapping) { /* fast path racy check */
+				if (unlikely(TryLockPage(page)))
+					continue;
+				if (page->buffers && page->mapping && memclass_related_bhs(page, classzone)) /* non racy check */
+					related_metadata = 1;
+				UnlockPage(page);
+			}
+			if (!related_metadata)
+				continue;
+		}
+
 		if (PageTestandClearReferenced(page)) {
 			list_del(&page->lru);
			list_add(&page->lru, &active_list);
			continue;
 		}
 
-		nr_pages--;
+		if (!related_metadata)
+			ratio--;
 
 		del_page_from_active_list(page);
 		add_page_to_inactive_list(page);
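A pattern worth noting in both hunks: a cheap unlocked test
(page->buffers && page->mapping) filters out pages which obviously don't
qualify, TryLockPage() is used so the scan never blocks, and the test is
then repeated under the page lock because the first look was racy.  Below
is a generic sketch of that idiom with hypothetical names, using pthreads
in place of the kernel's page lock; it illustrates the pattern only and
is not code from the patch.

	#include <pthread.h>
	#include <stdbool.h>

	struct object {
		pthread_mutex_t lock;	/* init with PTHREAD_MUTEX_INITIALIZER */
		bool interesting;	/* may change until we hold lock */
	};

	/*
	 * Returns true with obj->lock held if the object still qualifies;
	 * the caller is responsible for unlocking, just as shrink_cache()
	 * is for the page it jumps to free_bhs: with.
	 */
	bool claim_if_interesting(struct object *obj)
	{
		if (!obj->interesting)			/* fast path racy check */
			return false;
		if (pthread_mutex_trylock(&obj->lock))	/* don't stall the scan */
			return false;
		if (obj->interesting)			/* non racy re-check */
			return true;
		pthread_mutex_unlock(&obj->lock);	/* lost the race: back off */
		return false;
	}

The locked re-check is what makes the fast path safe: another CPU may
detach the buffers or truncate the mapping between the unlocked test and
the trylock, so the decision to strip buffers is only committed while the
page is held locked.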