--- linux-2.4.18-pre6/fs/reiserfs/stree.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/fs/reiserfs/stree.c Wed Jan 23 11:13:00 2002 @@ -648,9 +648,8 @@ int search_by_key (struct super_block * stop at leaf level - set to DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number = SB_ROOT_BLOCK (p_s_sb), - expected_level = SB_TREE_HEIGHT (p_s_sb), - n_block_size = p_s_sb->s_blocksize; + int n_block_number, expected_level; + int n_block_size = p_s_sb->s_blocksize; struct buffer_head * p_s_bh; struct path_element * p_s_last_element; int n_node_level, n_retval; @@ -662,7 +661,8 @@ int search_by_key (struct super_block * #endif PROC_INFO_INC( p_s_sb, search_by_key ); - + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ @@ -674,6 +674,8 @@ int search_by_key (struct super_block * /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); while ( 1 ) { #ifdef CONFIG_REISERFS_CHECK @@ -1099,6 +1101,8 @@ static char prepare_for_delete_or_cut( for (n_counter = *p_n_removed; n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + + conditional_schedule(); if (item_moved (&s_ih, p_s_path)) { need_research = 1 ; --- linux-2.4.18-pre6/include/linux/low-latency.h Thu Jan 1 00:00:00 1970 +++ linux-akpm/include/linux/low-latency.h Wed Jan 23 11:13:00 2002 @@ -0,0 +1,109 @@ +/* + * include/linux/low-latency.h + * + * Andrew Morton + */ + +#ifndef LOW_LATENCY_H_INCLUDED +#define LOW_LATENCY_H_INCLUDED + +#if defined(CONFIG_LOLAT) +#define LOWLATENCY_NEEDED 1 +#else +#define LOWLATENCY_NEEDED 0 +#endif + +#if LOWLATENCY_NEEDED + +#include /* For ____cacheline_aligned */ + +#ifdef CONFIG_LOLAT_SYSCTL +extern struct low_latency_enable_struct { + int yep; +} ____cacheline_aligned __enable_lowlatency; +#define enable_lowlatency __enable_lowlatency.yep + +#else +#define enable_lowlatency 1 +#endif + +/* + * Set this non-zero to generate low-latency instrumentation + */ +#define LOWLATENCY_DEBUG 0 + +/* + * Set this non-zero for robustness testing + */ +#define LOWLATENCY_ALWAYS_SCHEDULE 0 + +#if LOWLATENCY_DEBUG + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() ((enable_lowlatency == 2) || (enable_lowlatency && current->need_resched)) +#else +#define conditional_schedule_needed() (enable_lowlatency && current->need_resched) +#endif + +struct lolat_stats_t { + unsigned long count; + int visited; + const char *file; + int line; + struct lolat_stats_t *next; +}; + +void set_running_and_schedule(struct lolat_stats_t *stats); + +#define unconditional_schedule() \ + do { \ + static struct lolat_stats_t stats = { \ + file: __FILE__, \ + line: __LINE__, \ + }; \ + set_running_and_schedule(&stats); \ + } while (0) + +extern void show_lolat_stats(void); + +#else /* LOWLATENCY_DEBUG */ + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() 1 +#else +#define conditional_schedule_needed() (current->need_resched) +#endif + +void set_running_and_schedule(void); +#define unconditional_schedule() set_running_and_schedule() + +#endif /* LOWLATENCY_DEBUG */ + +#define conditional_schedule() \ + do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ + } while (0) + +#define 
DEFINE_RESCHED_COUNT int resched_count = 0 +#define TEST_RESCHED_COUNT(n) (enable_lowlatency && (++resched_count > (n))) +#define RESET_RESCHED_COUNT() resched_count = 0 +extern int ll_copy_to_user(void *to_user, const void *from, unsigned long len); +extern int ll_copy_from_user(void *to, const void *from_user, unsigned long len); + +#else /* LOWLATENCY_NEEDED */ + +#define conditional_schedule_needed() 0 +#define conditional_schedule() +#define unconditional_schedule() + +#define DEFINE_RESCHED_COUNT +#define TEST_RESCHED_COUNT(n) 0 +#define RESET_RESCHED_COUNT() +#define ll_copy_to_user(to_user, from, len) copy_to_user((to_user), (from), (len)) +#define ll_copy_from_user(to, from_user, len) copy_from_user((to), (from_user), (len)) + +#endif /* LOWLATENCY_NEEDED */ + +#endif /* LOW_LATENCY_H_INCLUDED */ + --- linux-2.4.18-pre6/include/linux/mm.h Fri Dec 21 11:19:23 2001 +++ linux-akpm/include/linux/mm.h Wed Jan 23 11:14:34 2002 @@ -121,6 +121,8 @@ extern int vm_max_readahead; */ extern pgprot_t protection_map[16]; +/* Actions for zap_page_range() */ +#define ZPR_COND_RESCHED 1 /* Do a conditional_schedule() occasionally */ /* * These are the virtual MM functions - opening of an area, closing and @@ -404,7 +406,7 @@ struct file *shmem_file_setup(char * nam extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); --- linux-2.4.18-pre6/include/linux/reiserfs_fs.h Fri Dec 21 11:19:23 2001 +++ linux-akpm/include/linux/reiserfs_fs.h Wed Jan 23 11:16:40 2002 @@ -1155,8 +1155,8 @@ static inline loff_t max_reiserfs_offset #define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter) #define get_generation(s) atomic_read (&fs_generation(s)) #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) -#define fs_changed(gen,s) (gen != get_generation (s)) - +#define __fs_changed(gen,s) (gen != get_generation (s)) +#define fs_changed(gen,s) ({conditional_schedule(); __fs_changed(gen,s);}) /***************************************************************************/ /* FIXATE NODES */ --- linux-2.4.18-pre6/include/linux/sched.h Fri Dec 21 11:19:23 2001 +++ linux-akpm/include/linux/sched.h Wed Jan 23 11:14:33 2002 @@ -26,6 +26,7 @@ extern unsigned long event; #include #include #include +#include struct exec_domain; --- linux-2.4.18-pre6/include/linux/sysctl.h Mon Nov 26 11:52:07 2001 +++ linux-akpm/include/linux/sysctl.h Wed Jan 23 11:14:33 2002 @@ -124,6 +124,7 @@ enum KERN_CORE_USES_PID=52, /* int: use core or core.%pid */ KERN_TAINTED=53, /* int: various kernel tainted flags */ KERN_CADPID=54, /* int: PID of the process to notify on CAD */ + KERN_LOWLATENCY=55, /* int: enable low latency scheduling */ }; --- linux-2.4.18-pre6/arch/i386/config.in Fri Dec 21 11:19:13 2001 +++ linux-akpm/arch/i386/config.in Wed Jan 23 11:13:00 2002 @@ -26,6 +26,9 @@ endmenu mainmenu_option next_comment comment 'Processor type and features' +bool 'Low latency scheduling' CONFIG_LOLAT +dep_bool 'Control low latency with sysctl' CONFIG_LOLAT_SYSCTL $CONFIG_LOLAT + 
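The counter macros above are combined with conditional_schedule_needed() and unconditional_schedule() into a polled lock-break loop throughout the rest of the patch (fsync_inode_buffers(), prune_dcache(), filemap_fdatawait(), ...). A minimal sketch of that pattern follows, assuming a made-up lock, list and per-item work function — only the low-latency macros themselves come from the patch:

	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/low-latency.h>

	static spinlock_t demo_lock = SPIN_LOCK_UNLOCKED;	/* hypothetical lock */
	extern void handle_one_entry(struct list_head *p);	/* hypothetical work */

	static void process_long_list(struct list_head *head)
	{
		struct list_head *p;
		DEFINE_RESCHED_COUNT;

	restart:
		spin_lock(&demo_lock);
		list_for_each(p, head) {
			handle_one_entry(p);
			/* Only poll need_resched every 32 items, as the
			 * fs/buffer.c changes below do. */
			if (TEST_RESCHED_COUNT(32)) {
				RESET_RESCHED_COUNT();
				if (conditional_schedule_needed()) {
					/* Drop the lock, yield, then rescan from
					 * the top: the list may have changed
					 * while we slept. */
					spin_unlock(&demo_lock);
					unconditional_schedule();
					goto restart;
				}
			}
		}
		spin_unlock(&demo_lock);
	}

With CONFIG_LOLAT disabled all of the macros compile away, so the loop reduces to the plain locked walk.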
choice 'Processor family' \ "386 CONFIG_M386 \ 486 CONFIG_M486 \ --- linux-2.4.18-pre6/drivers/block/ll_rw_blk.c Tue Jan 22 12:38:29 2002 +++ linux-akpm/drivers/block/ll_rw_blk.c Wed Jan 23 11:13:00 2002 @@ -917,6 +917,7 @@ void submit_bh(int rw, struct buffer_hea kstat.pgpgin += count; break; } + conditional_schedule(); } /** --- linux-2.4.18-pre6/drivers/char/mem.c Fri Dec 21 11:19:13 2001 +++ linux-akpm/drivers/char/mem.c Wed Jan 23 11:13:00 2002 @@ -400,7 +400,7 @@ static inline size_t read_zero_pagealign if (count > size) count = size; - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, 0); zeromap_page_range(addr, count, PAGE_COPY); size -= count; --- linux-2.4.18-pre6/drivers/char/random.c Thu Nov 22 23:02:57 2001 +++ linux-akpm/drivers/char/random.c Wed Jan 23 11:13:00 2002 @@ -1369,6 +1369,11 @@ static ssize_t extract_entropy(struct en buf += i; ret += i; add_timer_randomness(&extract_timer_state, nbytes); +#if LOWLATENCY_NEEDED + /* This can happen in softirq's, but that's what we want */ + if (conditional_schedule_needed()) + break; +#endif } /* Wipe data just returned from memory */ --- linux-2.4.18-pre6/fs/buffer.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/buffer.c Wed Jan 23 11:13:00 2002 @@ -204,8 +204,10 @@ static int write_some_buffers(kdev_t dev if (dev && bh->b_dev != dev) continue; - if (test_and_set_bit(BH_Lock, &bh->b_state)) + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + __refile_buffer(bh); continue; + } if (atomic_set_buffer_clean(bh)) { __refile_buffer(bh); get_bh(bh); @@ -215,6 +217,7 @@ static int write_some_buffers(kdev_t dev spin_unlock(&lru_list_lock); write_locked_buffers(array, count); + conditional_schedule(); return -EAGAIN; } unlock_buffer(bh); @@ -249,12 +252,19 @@ static int wait_for_buffers(kdev_t dev, struct buffer_head * next; int nr; - next = lru_list[index]; nr = nr_buffers_type[index]; +repeat: + next = lru_list[index]; while (next && --nr >= 0) { struct buffer_head *bh = next; next = bh->b_next_free; + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + goto repeat; + } if (!buffer_locked(bh)) { if (refile) __refile_buffer(bh); @@ -262,7 +272,11 @@ static int wait_for_buffers(kdev_t dev, } if (dev && bh->b_dev != dev) continue; - + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + return -EAGAIN; + } get_bh(bh); spin_unlock(&lru_list_lock); wait_on_buffer (bh); @@ -273,7 +287,7 @@ static int wait_for_buffers(kdev_t dev, return 0; } -static inline void wait_for_some_buffers(kdev_t dev) +static void wait_for_some_buffers(kdev_t dev) { spin_lock(&lru_list_lock); wait_for_buffers(dev, BUF_LOCKED, 1); @@ -301,6 +315,15 @@ int sync_buffers(kdev_t dev, int wait) { int err = 0; +#if LOWLATENCY_NEEDED + /* + * syncing devA when there are lots of buffers dirty against + * devB is expensive. + */ + if (enable_lowlatency) + dev = NODEV; +#endif + /* One pass for no-wait, three for wait: * 0) write out all dirty, unlocked buffers; * 1) wait for all dirty locked buffers; @@ -682,6 +705,16 @@ void invalidate_bdev(struct block_device /* Not hashed? 
*/ if (!bh->b_pprev) continue; + + if (conditional_schedule_needed()) { + get_bh(bh); + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + put_bh(bh); + slept = 1; + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -833,7 +866,8 @@ int fsync_inode_buffers(struct inode *in struct buffer_head *bh; struct inode tmp; int err = 0, err2; - + DEFINE_RESCHED_COUNT; + INIT_LIST_HEAD(&tmp.i_dirty_buffers); spin_lock(&lru_list_lock); @@ -854,8 +888,18 @@ int fsync_inode_buffers(struct inode *in spin_lock(&lru_list_lock); } } + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); /* Syncing many dirty buffers */ + spin_lock(&lru_list_lock); + } + } } + RESET_RESCHED_COUNT(); + while (!list_empty(&tmp.i_dirty_buffers)) { bh = BH_ENTRY(tmp.i_dirty_buffers.prev); remove_inode_queue(bh); @@ -882,12 +926,18 @@ int fsync_inode_data_buffers(struct inod struct buffer_head *bh; struct inode tmp; int err = 0, err2; - + INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); - + +repeat: spin_lock(&lru_list_lock); while (!list_empty(&inode->i_dirty_data_buffers)) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + goto repeat; + } bh = BH_ENTRY(inode->i_dirty_data_buffers.next); list_del(&bh->b_inode_buffers); if (!buffer_dirty(bh) && !buffer_locked(bh)) @@ -914,6 +964,7 @@ int fsync_inode_data_buffers(struct inod if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + conditional_schedule(); spin_lock(&lru_list_lock); } @@ -942,14 +993,23 @@ int osync_inode_buffers(struct inode *in struct buffer_head *bh; struct list_head *list; int err = 0; + DEFINE_RESCHED_COUNT; +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - repeat: - for (list = inode->i_dirty_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -957,7 +1017,6 @@ int osync_inode_buffers(struct inode *in if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); - spin_lock(&lru_list_lock); goto repeat; } } @@ -971,14 +1030,23 @@ int osync_inode_data_buffers(struct inod struct buffer_head *bh; struct list_head *list; int err = 0; + DEFINE_RESCHED_COUNT; +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - - repeat: for (list = inode->i_dirty_data_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers; list = bh->b_inode_buffers.prev) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -986,7 +1054,6 @@ int osync_inode_data_buffers(struct inod if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); - spin_lock(&lru_list_lock); goto repeat; } } @@ -1004,12 +1071,24 @@ int osync_inode_data_buffers(struct inod void invalidate_inode_buffers(struct inode *inode) { struct list_head * entry; - + +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) + while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } 
remove_inode_queue(BH_ENTRY(entry)); - while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) + } + while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } remove_inode_queue(BH_ENTRY(entry)); + } spin_unlock(&lru_list_lock); } @@ -1184,8 +1263,10 @@ struct buffer_head * bread(kdev_t dev, i bh = getblk(dev, block, size); touch_buffer(bh); - if (buffer_uptodate(bh)) + if (buffer_uptodate(bh)) { + conditional_schedule(); return bh; + } ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) @@ -2780,7 +2861,7 @@ void __init buffer_init(unsigned long me DECLARE_WAIT_QUEUE_HEAD(bdflush_wait); -void wakeup_bdflush(void) +void wakeup_bdflush(void) { wake_up_interruptible(&bdflush_wait); } --- linux-2.4.18-pre6/fs/dcache.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/dcache.c Wed Jan 23 11:13:00 2002 @@ -320,11 +320,23 @@ static inline void prune_one_dentry(stru void prune_dcache(int count) { + DEFINE_RESCHED_COUNT; + +redo: spin_lock(&dcache_lock); for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_RESCHED_COUNT(100)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&dcache_lock); + unconditional_schedule(); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -479,6 +491,7 @@ static int select_parent(struct dentry * struct dentry *this_parent = parent; struct list_head *next; int found = 0; + DEFINE_RESCHED_COUNT; spin_lock(&dcache_lock); repeat: @@ -493,6 +506,13 @@ resume: list_add(&dentry->d_lru, dentry_unused.prev); found++; } + + if (TEST_RESCHED_COUNT(500) && found > 10) { + if (conditional_schedule_needed()) /* Typically sys_rmdir() */ + goto out; + RESET_RESCHED_COUNT(); + } + /* * Descend a level if the d_subdirs list is non-empty. 
*/ @@ -517,6 +537,7 @@ this_parent->d_parent->d_name.name, this #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -532,8 +553,10 @@ void shrink_dcache_parent(struct dentry { int found; - while ((found = select_parent(parent)) != 0) + while ((found = select_parent(parent)) != 0) { prune_dcache(found); + conditional_schedule(); /* Typically sys_rmdir() */ + } } /* --- linux-2.4.18-pre6/fs/exec.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/exec.c Wed Jan 23 11:13:00 2002 @@ -223,7 +223,7 @@ int copy_strings(int argc,char ** argv, if (new) memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); } - err = copy_from_user(kaddr + offset, str, bytes_to_copy); + err = ll_copy_from_user(kaddr + offset, str, bytes_to_copy); kunmap(page); if (err) --- linux-2.4.18-pre6/fs/ext2/dir.c Mon Sep 17 13:16:30 2001 +++ linux-akpm/fs/ext2/dir.c Wed Jan 23 11:13:00 2002 @@ -148,6 +148,7 @@ static struct page * ext2_get_page(struc struct address_space *mapping = dir->i_mapping; struct page *page = read_cache_page(mapping, n, (filler_t*)mapping->a_ops->readpage, NULL); + conditional_schedule(); /* Scanning large directories */ if (!IS_ERR(page)) { wait_on_page(page); kmap(page); --- linux-2.4.18-pre6/fs/ext2/inode.c Thu Nov 22 23:02:58 2001 +++ linux-akpm/fs/ext2/inode.c Wed Jan 23 11:13:00 2002 @@ -715,8 +715,13 @@ static inline void ext2_free_data(struct { unsigned long block_to_free = 0, count = 0; unsigned long nr; + DEFINE_RESCHED_COUNT; for ( ; p < q ; p++) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + conditional_schedule(); + } nr = le32_to_cpu(*p); if (nr) { *p = 0; @@ -759,6 +764,7 @@ static void ext2_free_branches(struct in if (depth--) { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); for ( ; p < q ; p++) { + conditional_schedule(); /* Deleting large files */ nr = le32_to_cpu(*p); if (!nr) continue; --- linux-2.4.18-pre6/fs/inode.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/inode.c Wed Jan 23 11:13:00 2002 @@ -229,6 +229,8 @@ static inline void __sync_one(struct ino filemap_fdatawait(inode->i_mapping); + conditional_schedule(); + spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; if (!(inode->i_state & I_FREEING)) { @@ -539,6 +541,7 @@ static void dispose_list(struct list_hea while ((inode_entry = head->next) != head) { + conditional_schedule(); list_del(inode_entry); inode = list_entry(inode_entry, struct inode, i_list); @@ -567,9 +570,22 @@ static int invalidate_list(struct list_h if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + if (conditional_schedule_needed()) { + atomic_inc(&inode->i_count); + spin_unlock(&inode_lock); + unconditional_schedule(); + spin_lock(&inode_lock); + atomic_dec(&inode->i_count); + } + if (inode->i_sb != sb) continue; + atomic_inc(&inode->i_count); + spin_unlock(&inode_lock); invalidate_inode_buffers(inode); + spin_lock(&inode_lock); + atomic_dec(&inode->i_count); if (!atomic_read(&inode->i_count)) { list_del_init(&inode->i_hash); list_del(&inode->i_list); @@ -667,15 +683,28 @@ void prune_icache(int goal) struct list_head *entry, *freeable = &list; int count; struct inode * inode; + int nr_to_scan = inodes_stat.nr_unused; +resume: spin_lock(&inode_lock); - count = 0; entry = inode_unused.prev; - while (entry != &inode_unused) - { + while (entry != &inode_unused && nr_to_scan--) { struct list_head *tmp = entry; + if (conditional_schedule_needed()) { + /* + * Need to drop the lock. Reposition + * the list head so we start here next time. 
+ * This can corrupt the LRU nature of the + * unused list, but this isn't very important. + */ + list_del(&inode_unused); + list_add(&inode_unused, entry); + spin_unlock(&inode_lock); + unconditional_schedule(); + goto resume; + } entry = entry->prev; inode = INODE(tmp); if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) @@ -841,6 +870,8 @@ static struct inode * get_new_inode(stru inode = alloc_inode(); if (inode) { struct inode * old; + + conditional_schedule(); /* sync_old_buffers */ spin_lock(&inode_lock); /* We released the lock, so.. */ --- linux-2.4.18-pre6/fs/proc/array.c Thu Oct 11 09:00:01 2001 +++ linux-akpm/fs/proc/array.c Wed Jan 23 11:13:00 2002 @@ -412,9 +412,11 @@ static inline void statm_pte_range(pmd_t if (end > PMD_SIZE) end = PMD_SIZE; do { - pte_t page = *pte; + pte_t page; struct page *ptpage; + conditional_schedule(); /* For `top' and `ps' */ + page = *pte; address += PAGE_SIZE; pte++; if (pte_none(page)) --- linux-2.4.18-pre6/fs/proc/generic.c Fri Sep 7 10:53:59 2001 +++ linux-akpm/fs/proc/generic.c Wed Jan 23 11:13:00 2002 @@ -98,6 +98,8 @@ proc_file_read(struct file * file, char retval = n; break; } + + conditional_schedule(); /* Some /proc files are large */ /* This is a hack to allow mangling of file pos independent * of actual bytes read. Simply place the data at page, --- linux-2.4.18-pre6/fs/reiserfs/bitmap.c Fri Dec 21 11:19:22 2001 +++ linux-akpm/fs/reiserfs/bitmap.c Wed Jan 23 11:13:00 2002 @@ -412,17 +412,27 @@ free_and_return: } - reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; +/* this check needs to go before preparing the buffer because that can +** schedule when low-latency patches are in use. It is ok if the buffer +** is locked, preparing it will wait on it, and we handle the case where +** this block was allocated while we sleep below. +*/ + RFALSE( is_reusable (s, search_start, 0) == 0, + "vs-4140: bad block number found"); - RFALSE( buffer_locked (SB_AP_BITMAP (s)[i]) || - is_reusable (s, search_start, 0) == 0, - "vs-4140: bitmap block is locked or bad block number found"); + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; /* if this bit was already set, we've scheduled, and someone else ** has allocated it. loop around and try again */ if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; + /* if this block has been allocated while we slept, it is + ** impossible to find any more contiguous blocks for ourselves. + ** If we are doing preallocation, give up now and return. 
+ */ + if (for_prealloc) + goto free_and_return ; amount_needed++ ; continue ; } --- linux-2.4.18-pre6/fs/reiserfs/buffer2.c Fri Dec 21 11:19:22 2001 +++ linux-akpm/fs/reiserfs/buffer2.c Wed Jan 23 11:13:00 2002 @@ -55,6 +55,7 @@ struct buffer_head * reiserfs_bread (st PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); result = bread (super -> s_dev, n_block, n_size); + conditional_schedule(); PROC_INFO_INC( super, breads ); PROC_EXP( if( kstat.context_swtch != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); --- linux-2.4.18-pre6/fs/reiserfs/journal.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/reiserfs/journal.c Wed Jan 23 11:13:00 2002 @@ -574,6 +574,7 @@ inline void insert_journal_hash(struct r /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC( p_s_sb, journal.lock_journal ); + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; @@ -704,6 +705,7 @@ reiserfs_panic(s, "journal-539: flush_co mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + conditional_schedule(); count++ ; put_bh(tbh) ; /* once for our get_hash */ } @@ -833,6 +835,7 @@ static int _update_journal_header_block( set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -2092,6 +2095,7 @@ static int journal_join(struct reiserfs_ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2232,6 +2236,7 @@ int journal_mark_dirty_nolog(struct reis } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2683,6 +2688,7 @@ void reiserfs_prepare_for_journal(struct RFALSE( buffer_locked(bh) && cur_tb != NULL, "waiting while do_balance was running\n") ; wait_on_buffer(bh) ; + conditional_schedule(); } PROC_INFO_INC( p_s_sb, journal.prepare_retry ); retry_count++ ; @@ -2856,6 +2862,7 @@ printk("journal-2020: do_journal_end: BA /* copy all the real blocks into log area. dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + conditional_schedule(); /* getblk can sleep, so... 
*/ tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), p_s_sb->s_blocksize) ; --- linux-2.4.18-pre6/fs/jbd/checkpoint.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/jbd/checkpoint.c Wed Jan 23 11:13:00 2002 @@ -431,7 +431,11 @@ int __journal_clean_checkpoint_list(jour { transaction_t *transaction, *last_transaction, *next_transaction; int ret = 0; + int ll_retries = 4; /* lowlatency addition */ +restart: + if (ll_retries-- == 0) + goto out; transaction = journal->j_checkpoint_transactions; if (transaction == 0) goto out; @@ -451,6 +455,12 @@ int __journal_clean_checkpoint_list(jour jh = next_jh; next_jh = jh->b_cpnext; ret += __try_to_free_cp_buf(jh); + if (conditional_schedule_needed()) { + spin_unlock(&journal_datalist_lock); + unconditional_schedule(); + spin_lock(&journal_datalist_lock); + goto restart; + } } while (jh != last_jh); } } while (transaction != last_transaction); --- linux-2.4.18-pre6/fs/jbd/commit.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/jbd/commit.c Wed Jan 23 11:13:00 2002 @@ -212,6 +212,16 @@ write_out_data_locked: __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); + if (conditional_schedule_needed()) { + if (commit_transaction->t_sync_datalist) + commit_transaction->t_sync_datalist = + next_jh; + if (bufs) + break; + spin_unlock(&journal_datalist_lock); + unconditional_schedule(); + goto write_out_data; + } } } if (bufs == ARRAY_SIZE(wbuf)) { @@ -235,8 +245,7 @@ write_out_data_locked: journal_brelse_array(wbuf, bufs); lock_journal(journal); spin_lock(&journal_datalist_lock); - if (bufs) - goto write_out_data_locked; + goto write_out_data_locked; } /* @@ -272,6 +281,14 @@ sync_datalist_empty: */ while ((jh = commit_transaction->t_async_datalist)) { struct buffer_head *bh = jh2bh(jh); + if (conditional_schedule_needed()) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + unconditional_schedule(); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; /* List may have changed */ + } if (buffer_locked(bh)) { spin_unlock(&journal_datalist_lock); unlock_journal(journal); @@ -486,6 +503,8 @@ start_journal_io: wait_for_iobuf: while (commit_transaction->t_iobuf_list != NULL) { struct buffer_head *bh; + + conditional_schedule(); jh = commit_transaction->t_iobuf_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { @@ -622,6 +641,8 @@ skip_commit: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + + conditional_schedule(); /* journal is locked */ jh = commit_transaction->t_forget; J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || --- linux-2.4.18-pre6/fs/ext3/balloc.c Thu Nov 22 23:02:58 2001 +++ linux-akpm/fs/ext3/balloc.c Wed Jan 23 11:13:00 2002 @@ -365,6 +365,9 @@ do_more: } } #endif + /* superblock lock is held, so this is safe */ + conditional_schedule(); + BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { ext3_error (sb, __FUNCTION__, --- linux-2.4.18-pre6/fs/ext3/inode.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/ext3/inode.c Wed Jan 23 11:13:00 2002 @@ -905,6 +905,8 @@ struct buffer_head *ext3_bread(handle_t prev_blocks = inode->i_blocks; + conditional_schedule(); /* Reading large directories */ + bh = ext3_getblk (handle, inode, block, create, err); if (!bh) return bh; @@ -1600,6 +1602,7 @@ ext3_clear_blocks(handle_t *handle, stru */ for (p = first; p < last; p++) { u32 nr = le32_to_cpu(*p); + conditional_schedule(); if (nr) { struct buffer_head *bh; 
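For reference: with CONFIG_LOLAT enabled and the LOWLATENCY_DEBUG / LOWLATENCY_ALWAYS_SCHEDULE knobs in include/linux/low-latency.h left at 0, each bare conditional_schedule() sprinkled through these filesystem paths reduces to roughly the following, so the fast-path cost is a single test of current->need_resched:

	if (current->need_resched) {		/* conditional_schedule_needed() */
		/* unconditional_schedule() == set_running_and_schedule(),
		 * added to kernel/sched.c later in this patch */
		if (current->state != TASK_RUNNING)
			__set_current_state(TASK_RUNNING);
		schedule();
	}
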
@@ -1654,6 +1657,7 @@ static void ext3_free_data(handle_t *han } for (p = first; p < last; p++) { + conditional_schedule(); nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ @@ -1718,6 +1722,8 @@ static void ext3_free_branches(handle_t /* Go read the buffer for the next level down */ bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + + conditional_schedule(); /* * A read failure? Report error and clear slot --- linux-2.4.18-pre6/fs/ext3/namei.c Thu Nov 22 23:02:58 2001 +++ linux-akpm/fs/ext3/namei.c Wed Jan 23 11:13:00 2002 @@ -157,6 +157,7 @@ restart: if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); + conditional_schedule(); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ brelse(bh); --- linux-2.4.18-pre6/kernel/exit.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/kernel/exit.c Wed Jan 23 11:13:00 2002 @@ -203,6 +203,7 @@ static inline void close_files(struct fi } i++; set >>= 1; + conditional_schedule(); /* sys_exit, many files open */ } } } --- linux-2.4.18-pre6/kernel/ksyms.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/kernel/ksyms.c Wed Jan 23 11:13:00 2002 @@ -439,6 +439,13 @@ EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); EXPORT_SYMBOL(do_settimeofday); +#if LOWLATENCY_NEEDED +EXPORT_SYMBOL(set_running_and_schedule); +#ifdef CONFIG_LOLAT_SYSCTL +EXPORT_SYMBOL(__enable_lowlatency); +#endif +#endif + #if !defined(__ia64__) EXPORT_SYMBOL(loops_per_jiffy); #endif --- linux-2.4.18-pre6/kernel/module.c Thu Nov 22 23:02:59 2001 +++ linux-akpm/kernel/module.c Wed Jan 23 11:13:00 2002 @@ -1174,6 +1174,11 @@ static void *s_start(struct seq_file *m, return ERR_PTR(-ENOMEM); lock_kernel(); for (v = module_list, n = *pos; v; n -= v->nsyms, v = v->next) { +#if 0 + /* We can't actually do this, because we'd create a + * race against module unload. Need a semaphore. 
*/ + conditional_schedule(); +#endif if (n < v->nsyms) { p->mod = v; p->index = n; --- linux-2.4.18-pre6/kernel/sched.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/kernel/sched.c Wed Jan 23 11:13:00 2002 @@ -302,6 +302,17 @@ send_now_idle: if (tsk->processor != this_cpu) smp_send_reschedule(tsk->processor); } +#if LOWLATENCY_NEEDED + if (enable_lowlatency && (p->policy != SCHED_OTHER)) { + struct task_struct *t; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + t = cpu_curr(cpu); + if (t != tsk) + t->need_resched = 1; + } + } +#endif return; @@ -1342,3 +1353,93 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } + +#if LOWLATENCY_NEEDED +#if LOWLATENCY_DEBUG + +static struct lolat_stats_t *lolat_stats_head; +static spinlock_t lolat_stats_lock = SPIN_LOCK_UNLOCKED; + +void set_running_and_schedule(struct lolat_stats_t *stats) +{ + spin_lock(&lolat_stats_lock); + if (stats->visited == 0) { + stats->visited = 1; + stats->next = lolat_stats_head; + lolat_stats_head = stats; + } + stats->count++; + spin_unlock(&lolat_stats_lock); + + if (current->state != TASK_RUNNING) + set_current_state(TASK_RUNNING); + schedule(); +} + +void show_lolat_stats(void) +{ + struct lolat_stats_t *stats = lolat_stats_head; + + printk("Low latency scheduling stats:\n"); + while (stats) { + printk("%s:%d: %lu\n", stats->file, stats->line, stats->count); + stats->count = 0; + stats = stats->next; + } +} + +#else /* LOWLATENCY_DEBUG */ + +void set_running_and_schedule() +{ + if (current->state != TASK_RUNNING) + __set_current_state(TASK_RUNNING); + schedule(); +} + +#endif /* LOWLATENCY_DEBUG */ + +int ll_copy_to_user(void *to_user, const void *from, unsigned long len) +{ + while (len) { + unsigned long n_to_copy = len; + unsigned long remainder; + + if (n_to_copy > 4096) + n_to_copy = 4096; + remainder = copy_to_user(to_user, from, n_to_copy); + if (remainder) + return remainder + len; + to_user = ((char *)to_user) + n_to_copy; + from = ((char *)from) + n_to_copy; + len -= n_to_copy; + conditional_schedule(); + } + return 0; +} + +int ll_copy_from_user(void *to, const void *from_user, unsigned long len) +{ + while (len) { + unsigned long n_to_copy = len; + unsigned long remainder; + + if (n_to_copy > 4096) + n_to_copy = 4096; + remainder = copy_from_user(to, from_user, n_to_copy); + if (remainder) + return remainder + len; + to = ((char *)to) + n_to_copy; + from_user = ((char *)from_user) + n_to_copy; + len -= n_to_copy; + conditional_schedule(); + } + return 0; +} + +#ifdef CONFIG_LOLAT_SYSCTL +struct low_latency_enable_struct __enable_lowlatency = { 0, }; +#endif + +#endif /* LOWLATENCY_NEEDED */ + --- linux-2.4.18-pre6/kernel/sysctl.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/kernel/sysctl.c Wed Jan 23 11:13:00 2002 @@ -256,6 +256,10 @@ static ctl_table kern_table[] = { {KERN_S390_USER_DEBUG_LOGGING,"userprocess_debug", &sysctl_userprocess_debug,sizeof(int),0644,NULL,&proc_dointvec}, #endif +#ifdef CONFIG_LOLAT_SYSCTL + {KERN_LOWLATENCY, "lowlatency", &enable_lowlatency, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif {0} }; --- linux-2.4.18-pre6/mm/filemap.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/mm/filemap.c Wed Jan 23 11:13:00 2002 @@ -187,6 +187,18 @@ void invalidate_inode_pages(struct inode page = list_entry(curr, struct page, list); curr = curr->next; + if (conditional_schedule_needed() && !TryLockPage(page)) { + /* + * Page is locked. 
We can drop the spinlocks + */ + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + unconditional_schedule(); + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + UnlockPage(page); + } + /* We cannot invalidate something in dirty.. */ if (PageDirty(page)) continue; @@ -256,12 +268,24 @@ static int truncate_list_pages(struct li struct list_head *curr; struct page * page; int unlocked = 0; + int restart_count = 4; restart: curr = head->prev; while (curr != head) { unsigned long offset; + if (conditional_schedule_needed() && restart_count) { + restart_count--; + list_del(head); + list_add(head, curr); /* Restart on this page */ + spin_unlock(&pagecache_lock); + unconditional_schedule(); + spin_lock(&pagecache_lock); + unlocked = 1; + goto restart; + } + page = list_entry(curr, struct page, list); offset = page->index; @@ -294,13 +318,11 @@ static int truncate_list_pages(struct li } else wait_on_page(page); - page_cache_release(page); - - if (current->need_resched) { - __set_current_state(TASK_RUNNING); - schedule(); + if (LOWLATENCY_NEEDED) { + restart_count = 4; /* We made progress */ } + page_cache_release(page); spin_lock(&pagecache_lock); goto restart; } @@ -509,6 +531,7 @@ static int do_buffer_fdatasync(struct li page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() (only used by minixfs, udf) */ lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ @@ -595,12 +618,14 @@ int filemap_fdatasync(struct address_spa list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) - continue; - page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() */ + + if (!PageDirty(page)) + goto clean; + lock_page(page); if (PageDirty(page)) { @@ -611,7 +636,7 @@ int filemap_fdatasync(struct address_spa ret = err; } else UnlockPage(page); - +clean: page_cache_release(page); spin_lock(&pagecache_lock); } @@ -629,7 +654,8 @@ int filemap_fdatasync(struct address_spa int filemap_fdatawait(struct address_space * mapping) { int ret = 0; - + DEFINE_RESCHED_COUNT; +restart: spin_lock(&pagecache_lock); while (!list_empty(&mapping->locked_pages)) { @@ -638,6 +664,17 @@ int filemap_fdatawait(struct address_spa list_del(&page->list); list_add(&page->list, &mapping->clean_pages); + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + page_cache_get(page); + spin_unlock(&pagecache_lock); + unconditional_schedule(); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -738,8 +775,10 @@ static int page_cache_read(struct file * spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); spin_unlock(&pagecache_lock); - if (page) + if (page) { + conditional_schedule(); return 0; + } page = page_cache_alloc(mapping); if (!page) @@ -904,6 +943,11 @@ static struct page * __find_lock_page_he * the hash-list needs a held write-lock. 
*/ repeat: + if (conditional_schedule_needed()) { + spin_unlock(&pagecache_lock); + unconditional_schedule(); + spin_lock(&pagecache_lock); + } page = __find_page_nolock(mapping, offset, hash); if (page) { page_cache_get(page); @@ -1368,6 +1412,8 @@ found_page: page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_read() */ + if (!Page_Uptodate(page)) goto page_not_up_to_date; generic_file_readahead(reada_ok, filp, inode, page); @@ -2066,6 +2112,12 @@ static inline int filemap_sync_pte_range address += PAGE_SIZE; pte++; } while (address && (address < end)); + + if (conditional_schedule_needed()) { + spin_unlock(&vma->vm_mm->page_table_lock); + unconditional_schedule(); /* syncing large mapped files */ + spin_lock(&vma->vm_mm->page_table_lock); + } return error; } @@ -2473,7 +2525,9 @@ static long madvise_dontneed(struct vm_a if (vma->vm_flags & VM_LOCKED) return -EINVAL; - zap_page_range(vma->vm_mm, start, end - start); + zap_page_range(vma->vm_mm, start, end - start, + ZPR_COND_RESCHED); /* sys_madvise(MADV_DONTNEED) */ + return 0; } @@ -3038,6 +3092,9 @@ generic_file_write(struct file *file,con goto sync_failure; page_fault = __copy_from_user(kaddr+offset, buf, bytes); flush_dcache_page(page); + + conditional_schedule(); + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); if (page_fault) goto fail_write; --- linux-2.4.18-pre6/mm/memory.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/mm/memory.c Wed Jan 23 11:13:00 2002 @@ -355,7 +355,7 @@ static inline int zap_pmd_range(mmu_gath /* * remove user pages in a given range. */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +static void do_zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; @@ -465,6 +465,10 @@ int get_user_pages(struct task_struct *t struct page *map; while (!(map = follow_page(mm, start, write))) { spin_unlock(&mm->page_table_lock); + + /* Pinning down many physical pages (kiobufs, mlockall) */ + conditional_schedule(); + switch (handle_mm_fault(mm, vma, start, write)) { case 1: tsk->min_flt++; @@ -609,6 +613,21 @@ void unmap_kiobuf (struct kiobuf *iobuf) iobuf->locked = 0; } +#define MAX_ZAP_BYTES 256*PAGE_SIZE + +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + if (actions & ZPR_COND_RESCHED && chunk > MAX_ZAP_BYTES) + chunk = MAX_ZAP_BYTES; + do_zap_page_range(mm, address, chunk); + if (actions & ZPR_COND_RESCHED) + conditional_schedule(); + address += chunk; + size -= chunk; + } +} /* * Lock down all of the pages of a kiovec for IO. 
@@ -718,11 +737,18 @@ int unlock_kiovec(int nr, struct kiobuf return 0; } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte, + unsigned long address, unsigned long size, + pgprot_t prot) { unsigned long end; + if (conditional_schedule_needed()) { + spin_unlock(&mm->page_table_lock); + unconditional_schedule(); /* mmap(/dev/zero) */ + spin_lock(&mm->page_table_lock); + } + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -750,7 +776,7 @@ static inline int zeromap_pmd_range(stru pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(mm, pte, address, end - address, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -984,7 +1010,7 @@ static void vmtruncate_list(struct vm_ar /* mapping wholly truncated? */ if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, 0); continue; } @@ -997,7 +1023,7 @@ static void vmtruncate_list(struct vm_ar /* Ok, partially affected.. */ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, 0); } while ((mpnt = mpnt->vm_next_share) != NULL); } --- linux-2.4.18-pre6/mm/mmap.c Mon Nov 5 21:01:12 2001 +++ linux-akpm/mm/mmap.c Wed Jan 23 11:13:00 2002 @@ -569,7 +569,7 @@ unmap_and_free_vma: fput(file); /* Undo any partial mapping done by a device driver. */ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, 0); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -967,7 +967,7 @@ int do_munmap(struct mm_struct *mm, unsi remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + zap_page_range(mm, st, size, ZPR_COND_RESCHED); /* sys_munmap() */ /* * Fix the mapping, and free the old area if it wasn't reused. 
@@ -1127,7 +1127,7 @@ void exit_mmap(struct mm_struct * mm) } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_COND_RESCHED); /* sys_exit() */ if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); --- linux-2.4.18-pre6/mm/mremap.c Thu Sep 20 20:31:26 2001 +++ linux-akpm/mm/mremap.c Wed Jan 23 11:13:00 2002 @@ -118,7 +118,7 @@ oops_we_failed: flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + zap_page_range(mm, new_addr, len, 0); return -1; } --- linux-2.4.18-pre6/mm/slab.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/mm/slab.c Wed Jan 23 11:13:00 2002 @@ -937,6 +937,7 @@ static int __kmem_cache_shrink(kmem_cach spin_unlock_irq(&cachep->spinlock); kmem_slab_destroy(cachep, slabp); + conditional_schedule(); /* Can take 30 milliseconds */ spin_lock_irq(&cachep->spinlock); } ret = !list_empty(&cachep->slabs_full) || !list_empty(&cachep->slabs_partial); @@ -1821,6 +1822,7 @@ perfect: */ spin_unlock_irq(&best_cachep->spinlock); kmem_slab_destroy(best_cachep, slabp); + conditional_schedule(); /* try_to_free_pages() */ spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); --- linux-2.4.18-pre6/mm/swapfile.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/mm/swapfile.c Wed Jan 23 11:13:00 2002 @@ -823,7 +823,7 @@ int get_swaparea_info(char *buf) len += sprintf(buf + len, "partition\t"); usedswap = 0; - for (j = 0; j < ptr->max; ++j) + for (j = 0; j < ptr->max; ++j) { switch (ptr->swap_map[j]) { case SWAP_MAP_BAD: case 0: @@ -831,6 +831,8 @@ int get_swaparea_info(char *buf) default: usedswap++; } + conditional_schedule(); + } len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), usedswap << (PAGE_SHIFT - 10), ptr->prio); } @@ -1124,6 +1126,11 @@ void si_swapinfo(struct sysinfo *val) if (swap_info[i].flags != SWP_USED) continue; for (j = 0; j < swap_info[i].max; ++j) { + if (conditional_schedule_needed()) { + swap_list_unlock(); + conditional_schedule(); + swap_list_lock(); + } switch (swap_info[i].swap_map[j]) { case 0: case SWAP_MAP_BAD: --- linux-2.4.18-pre6/mm/vmscan.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/mm/vmscan.c Wed Jan 23 11:13:00 2002 @@ -157,6 +157,7 @@ static inline int swap_out_pmd(struct mm { pte_t * pte; unsigned long pmd_end; + DEFINE_RESCHED_COUNT; if (pmd_none(*dir)) return count; @@ -182,11 +183,17 @@ static inline int swap_out_pmd(struct mm address += PAGE_SIZE; break; } + if (TEST_RESCHED_COUNT(4)) { + if (conditional_schedule_needed()) + goto out; + RESET_RESCHED_COUNT(); + } } } address += PAGE_SIZE; pte++; } while (address && (address < end)); +out: mm->swap_address = address; return count; } @@ -215,6 +222,8 @@ static inline int swap_out_pgd(struct mm count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); if (!count) break; + if (conditional_schedule_needed()) + return count; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -240,6 +249,8 @@ static inline int swap_out_vma(struct mm count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); if (!count) break; + if (conditional_schedule_needed()) + return count; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); @@ -261,6 +272,7 @@ static inline int swap_out_mm(struct mm_ * Find the proper vm-area after freezing the vma chain * and ptes. 
*/ +continue_scan: spin_lock(&mm->page_table_lock); address = mm->swap_address; if (address == TASK_SIZE || swap_mm != mm) { @@ -278,6 +290,12 @@ static inline int swap_out_mm(struct mm_ vma = vma->vm_next; if (!vma) break; + if (conditional_schedule_needed()) { /* Scanning a large vma */ + spin_unlock(&mm->page_table_lock); + unconditional_schedule(); + /* Continue from where we left off */ + goto continue_scan; + } if (!count) goto out_unlock; address = vma->vm_start; --- linux-2.4.18-pre6/net/core/iovec.c Mon Sep 10 07:57:00 2001 +++ linux-akpm/net/core/iovec.c Wed Jan 23 11:13:00 2002 @@ -88,7 +88,7 @@ int memcpy_toiovec(struct iovec *iov, un if(iov->iov_len) { int copy = min_t(unsigned int, iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) + if (ll_copy_to_user(iov->iov_base, kdata, copy)) goto out; kdata+=copy; len-=copy; --- linux-2.4.18-pre6/net/ipv4/tcp_minisocks.c Mon Oct 1 09:19:57 2001 +++ linux-akpm/net/ipv4/tcp_minisocks.c Wed Jan 23 11:13:00 2002 @@ -434,6 +434,9 @@ static void SMP_TIMER_NAME(tcp_twkill)(u { struct tcp_tw_bucket *tw; int killed = 0; +#if LOWLATENCY_NEEDED + int max_killed = 0; +#endif /* NOTE: compare this to previous version where lock * was released after detaching chain. It was racy, @@ -447,6 +450,13 @@ static void SMP_TIMER_NAME(tcp_twkill)(u goto out; while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { +#if LOWLATENCY_NEEDED + /* This loop takes ~6 usecs per iteration. */ + if (killed > 100) { + max_killed = 1; + break; + } +#endif tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; tw->pprev_death = NULL; spin_unlock(&tw_death_lock); @@ -457,12 +467,24 @@ static void SMP_TIMER_NAME(tcp_twkill)(u killed++; spin_lock(&tw_death_lock); + + } + +#if LOWLATENCY_NEEDED + if (max_killed) { /* More to do: do it soon */ + mod_timer(&tcp_tw_timer, jiffies+2); + tcp_tw_count -= killed; + } + else +#endif + { + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - if ((tcp_tw_count -= killed) != 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); net_statistics[smp_processor_id()*2].TimeWaited += killed; out: spin_unlock(&tw_death_lock);
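
The copy_to_user()/copy_from_user() conversions in fs/exec.c and net/core/iovec.c above all have the same shape; a hypothetical example of the pattern (the function and its ubuf/kbuf/count parameters are invented for illustration — only ll_copy_to_user() comes from the patch):

	#include <asm/uaccess.h>
	#include <linux/low-latency.h>

	/* Copy a potentially large result buffer to userspace.  On resident
	 * pages a single copy_to_user() offers no reschedule point;
	 * ll_copy_to_user() (kernel/sched.c above) copies at most 4096 bytes
	 * per iteration and calls conditional_schedule() between chunks.
	 * With CONFIG_LOLAT off it is #defined back to copy_to_user(). */
	static int copy_result_to_user(char *ubuf, const char *kbuf, unsigned long count)
	{
		if (ll_copy_to_user(ubuf, kbuf, count))
			return -EFAULT;
		return 0;
	}

zap_page_range() applies the same chunk-and-yield idea under ZPR_COND_RESCHED: at most MAX_ZAP_BYTES (256 pages) per do_zap_page_range() call, with a conditional_schedule() between chunks. With CONFIG_LOLAT_SYSCTL the whole mechanism can be switched at runtime through the new "lowlatency" entry under /proc/sys/kernel.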