--- linux-2.4.18-pre6/fs/reiserfs/stree.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/fs/reiserfs/stree.c Wed Jan 23 11:13:00 2002 @@ -648,9 +648,8 @@ int search_by_key (struct super_block * stop at leaf level - set to DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number = SB_ROOT_BLOCK (p_s_sb), - expected_level = SB_TREE_HEIGHT (p_s_sb), - n_block_size = p_s_sb->s_blocksize; + int n_block_number, expected_level; + int n_block_size = p_s_sb->s_blocksize; struct buffer_head * p_s_bh; struct path_element * p_s_last_element; int n_node_level, n_retval; @@ -662,7 +661,8 @@ int search_by_key (struct super_block * #endif PROC_INFO_INC( p_s_sb, search_by_key ); - + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ @@ -674,6 +674,8 @@ int search_by_key (struct super_block * /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); while ( 1 ) { #ifdef CONFIG_REISERFS_CHECK @@ -1099,6 +1101,8 @@ static char prepare_for_delete_or_cut( for (n_counter = *p_n_removed; n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + + conditional_schedule(); if (item_moved (&s_ih, p_s_path)) { need_research = 1 ; --- linux-2.4.18-pre6/include/linux/low-latency.h Thu Jan 1 00:00:00 1970 +++ linux-akpm/include/linux/low-latency.h Wed Jan 23 11:13:00 2002 @@ -0,0 +1,109 @@ +/* + * include/linux/low-latency.h + * + * Andrew Morton + */ + +#ifndef LOW_LATENCY_H_INCLUDED +#define LOW_LATENCY_H_INCLUDED + +#if defined(CONFIG_LOLAT) +#define LOWLATENCY_NEEDED 1 +#else +#define LOWLATENCY_NEEDED 0 +#endif + +#if LOWLATENCY_NEEDED + +#include /* For ____cacheline_aligned */ + +#ifdef CONFIG_LOLAT_SYSCTL +extern struct low_latency_enable_struct { + int yep; +} ____cacheline_aligned __enable_lowlatency; +#define enable_lowlatency __enable_lowlatency.yep + +#else +#define enable_lowlatency 1 +#endif + +/* + * Set this non-zero to generate low-latency instrumentation + */ +#define LOWLATENCY_DEBUG 0 + +/* + * Set this non-zero for robustness testing + */ +#define LOWLATENCY_ALWAYS_SCHEDULE 0 + +#if LOWLATENCY_DEBUG + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() ((enable_lowlatency == 2) || (enable_lowlatency && current->need_resched)) +#else +#define conditional_schedule_needed() (enable_lowlatency && current->need_resched) +#endif + +struct lolat_stats_t { + unsigned long count; + int visited; + const char *file; + int line; + struct lolat_stats_t *next; +}; + +void set_running_and_schedule(struct lolat_stats_t *stats); + +#define unconditional_schedule() \ + do { \ + static struct lolat_stats_t stats = { \ + file: __FILE__, \ + line: __LINE__, \ + }; \ + set_running_and_schedule(&stats); \ + } while (0) + +extern void show_lolat_stats(void); + +#else /* LOWLATENCY_DEBUG */ + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() 1 +#else +#define conditional_schedule_needed() (current->need_resched) +#endif + +void set_running_and_schedule(void); +#define unconditional_schedule() set_running_and_schedule() + +#endif /* LOWLATENCY_DEBUG */ + +#define conditional_schedule() \ + do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ + } while (0) + +#define 
DEFINE_RESCHED_COUNT int resched_count = 0 +#define TEST_RESCHED_COUNT(n) (enable_lowlatency && (++resched_count > (n))) +#define RESET_RESCHED_COUNT() resched_count = 0 +extern int ll_copy_to_user(void *to_user, const void *from, unsigned long len); +extern int ll_copy_from_user(void *to, const void *from_user, unsigned long len); + +#else /* LOWLATENCY_NEEDED */ + +#define conditional_schedule_needed() 0 +#define conditional_schedule() +#define unconditional_schedule() + +#define DEFINE_RESCHED_COUNT +#define TEST_RESCHED_COUNT(n) 0 +#define RESET_RESCHED_COUNT() +#define ll_copy_to_user(to_user, from, len) copy_to_user((to_user), (from), (len)) +#define ll_copy_from_user(to, from_user, len) copy_from_user((to), (from_user), (len)) + +#endif /* LOWLATENCY_NEEDED */ + +#endif /* LOW_LATENCY_H_INCLUDED */ + --- linux-2.4.18-pre6/include/linux/mm.h Fri Dec 21 11:19:23 2001 +++ linux-akpm/include/linux/mm.h Wed Jan 23 11:14:34 2002 @@ -121,6 +121,8 @@ extern int vm_max_readahead; */ extern pgprot_t protection_map[16]; +/* Actions for zap_page_range() */ +#define ZPR_COND_RESCHED 1 /* Do a conditional_schedule() occasionally */ /* * These are the virtual MM functions - opening of an area, closing and @@ -404,7 +406,7 @@ struct file *shmem_file_setup(char * nam extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); --- linux-2.4.18-pre6/include/linux/reiserfs_fs.h Fri Dec 21 11:19:23 2001 +++ linux-akpm/include/linux/reiserfs_fs.h Wed Jan 23 11:16:40 2002 @@ -1155,8 +1155,8 @@ static inline loff_t max_reiserfs_offset #define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter) #define get_generation(s) atomic_read (&fs_generation(s)) #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) -#define fs_changed(gen,s) (gen != get_generation (s)) - +#define __fs_changed(gen,s) (gen != get_generation (s)) +#define fs_changed(gen,s) ({conditional_schedule(); __fs_changed(gen,s);}) /***************************************************************************/ /* FIXATE NODES */ --- linux-2.4.18-pre6/include/linux/sched.h Fri Dec 21 11:19:23 2001 +++ linux-akpm/include/linux/sched.h Wed Jan 23 11:14:33 2002 @@ -26,6 +26,7 @@ extern unsigned long event; #include #include #include +#include struct exec_domain; --- linux-2.4.18-pre6/include/linux/sysctl.h Mon Nov 26 11:52:07 2001 +++ linux-akpm/include/linux/sysctl.h Wed Jan 23 11:14:33 2002 @@ -124,6 +124,7 @@ enum KERN_CORE_USES_PID=52, /* int: use core or core.%pid */ KERN_TAINTED=53, /* int: various kernel tainted flags */ KERN_CADPID=54, /* int: PID of the process to notify on CAD */ + KERN_LOWLATENCY=55, /* int: enable low latency scheduling */ }; --- linux-2.4.18-pre6/arch/i386/config.in Fri Dec 21 11:19:13 2001 +++ linux-akpm/arch/i386/config.in Wed Jan 23 11:13:00 2002 @@ -26,6 +26,9 @@ endmenu mainmenu_option next_comment comment 'Processor type and features' +bool 'Low latency scheduling' CONFIG_LOLAT +dep_bool 'Control low latency with sysctl' CONFIG_LOLAT_SYSCTL $CONFIG_LOLAT + 
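The counter macros above are combined with conditional_schedule_needed() and unconditional_schedule() into a polled lock-break loop throughout the rest of the patch (fsync_inode_buffers(), prune_dcache(), filemap_fdatawait(), ...). A minimal sketch of that pattern follows, assuming a made-up lock, list and per-item work function — only the low-latency macros themselves come from the patch:

	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/low-latency.h>

	static spinlock_t demo_lock = SPIN_LOCK_UNLOCKED;	/* hypothetical lock */
	extern void handle_one_entry(struct list_head *p);	/* hypothetical work */

	static void process_long_list(struct list_head *head)
	{
		struct list_head *p;
		DEFINE_RESCHED_COUNT;

	restart:
		spin_lock(&demo_lock);
		list_for_each(p, head) {
			handle_one_entry(p);
			/* Only poll need_resched every 32 items, as the
			 * fs/buffer.c changes below do. */
			if (TEST_RESCHED_COUNT(32)) {
				RESET_RESCHED_COUNT();
				if (conditional_schedule_needed()) {
					/* Drop the lock, yield, then rescan from
					 * the top: the list may have changed
					 * while we slept. */
					spin_unlock(&demo_lock);
					unconditional_schedule();
					goto restart;
				}
			}
		}
		spin_unlock(&demo_lock);
	}

With CONFIG_LOLAT disabled all of the macros compile away, so the loop reduces to the plain locked walk.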
choice 'Processor family' \ "386 CONFIG_M386 \ 486 CONFIG_M486 \ --- linux-2.4.18-pre6/drivers/block/ll_rw_blk.c Tue Jan 22 12:38:29 2002 +++ linux-akpm/drivers/block/ll_rw_blk.c Wed Jan 23 11:13:00 2002 @@ -917,6 +917,7 @@ void submit_bh(int rw, struct buffer_hea kstat.pgpgin += count; break; } + conditional_schedule(); } /** --- linux-2.4.18-pre6/drivers/char/mem.c Fri Dec 21 11:19:13 2001 +++ linux-akpm/drivers/char/mem.c Wed Jan 23 11:13:00 2002 @@ -400,7 +400,7 @@ static inline size_t read_zero_pagealign if (count > size) count = size; - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, 0); zeromap_page_range(addr, count, PAGE_COPY); size -= count; --- linux-2.4.18-pre6/drivers/char/random.c Thu Nov 22 23:02:57 2001 +++ linux-akpm/drivers/char/random.c Wed Jan 23 11:13:00 2002 @@ -1369,6 +1369,11 @@ static ssize_t extract_entropy(struct en buf += i; ret += i; add_timer_randomness(&extract_timer_state, nbytes); +#if LOWLATENCY_NEEDED + /* This can happen in softirq's, but that's what we want */ + if (conditional_schedule_needed()) + break; +#endif } /* Wipe data just returned from memory */ --- linux-2.4.18-pre6/fs/buffer.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/buffer.c Wed Jan 23 11:13:00 2002 @@ -204,8 +204,10 @@ static int write_some_buffers(kdev_t dev if (dev && bh->b_dev != dev) continue; - if (test_and_set_bit(BH_Lock, &bh->b_state)) + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + __refile_buffer(bh); continue; + } if (atomic_set_buffer_clean(bh)) { __refile_buffer(bh); get_bh(bh); @@ -215,6 +217,7 @@ static int write_some_buffers(kdev_t dev spin_unlock(&lru_list_lock); write_locked_buffers(array, count); + conditional_schedule(); return -EAGAIN; } unlock_buffer(bh); @@ -249,12 +252,19 @@ static int wait_for_buffers(kdev_t dev, struct buffer_head * next; int nr; - next = lru_list[index]; nr = nr_buffers_type[index]; +repeat: + next = lru_list[index]; while (next && --nr >= 0) { struct buffer_head *bh = next; next = bh->b_next_free; + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + goto repeat; + } if (!buffer_locked(bh)) { if (refile) __refile_buffer(bh); @@ -262,7 +272,11 @@ static int wait_for_buffers(kdev_t dev, } if (dev && bh->b_dev != dev) continue; - + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + return -EAGAIN; + } get_bh(bh); spin_unlock(&lru_list_lock); wait_on_buffer (bh); @@ -273,7 +287,7 @@ static int wait_for_buffers(kdev_t dev, return 0; } -static inline void wait_for_some_buffers(kdev_t dev) +static void wait_for_some_buffers(kdev_t dev) { spin_lock(&lru_list_lock); wait_for_buffers(dev, BUF_LOCKED, 1); @@ -301,6 +315,15 @@ int sync_buffers(kdev_t dev, int wait) { int err = 0; +#if LOWLATENCY_NEEDED + /* + * syncing devA when there are lots of buffers dirty against + * devB is expensive. + */ + if (enable_lowlatency) + dev = NODEV; +#endif + /* One pass for no-wait, three for wait: * 0) write out all dirty, unlocked buffers; * 1) wait for all dirty locked buffers; @@ -682,6 +705,16 @@ void invalidate_bdev(struct block_device /* Not hashed? 
*/ if (!bh->b_pprev) continue; + + if (conditional_schedule_needed()) { + get_bh(bh); + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + put_bh(bh); + slept = 1; + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -833,7 +866,8 @@ int fsync_inode_buffers(struct inode *in struct buffer_head *bh; struct inode tmp; int err = 0, err2; - + DEFINE_RESCHED_COUNT; + INIT_LIST_HEAD(&tmp.i_dirty_buffers); spin_lock(&lru_list_lock); @@ -854,8 +888,18 @@ int fsync_inode_buffers(struct inode *in spin_lock(&lru_list_lock); } } + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); /* Syncing many dirty buffers */ + spin_lock(&lru_list_lock); + } + } } + RESET_RESCHED_COUNT(); + while (!list_empty(&tmp.i_dirty_buffers)) { bh = BH_ENTRY(tmp.i_dirty_buffers.prev); remove_inode_queue(bh); @@ -882,12 +926,18 @@ int fsync_inode_data_buffers(struct inod struct buffer_head *bh; struct inode tmp; int err = 0, err2; - + INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); - + +repeat: spin_lock(&lru_list_lock); while (!list_empty(&inode->i_dirty_data_buffers)) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + goto repeat; + } bh = BH_ENTRY(inode->i_dirty_data_buffers.next); list_del(&bh->b_inode_buffers); if (!buffer_dirty(bh) && !buffer_locked(bh)) @@ -914,6 +964,7 @@ int fsync_inode_data_buffers(struct inod if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + conditional_schedule(); spin_lock(&lru_list_lock); } @@ -942,14 +993,23 @@ int osync_inode_buffers(struct inode *in struct buffer_head *bh; struct list_head *list; int err = 0; + DEFINE_RESCHED_COUNT; +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - repeat: - for (list = inode->i_dirty_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -957,7 +1017,6 @@ int osync_inode_buffers(struct inode *in if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); - spin_lock(&lru_list_lock); goto repeat; } } @@ -971,14 +1030,23 @@ int osync_inode_data_buffers(struct inod struct buffer_head *bh; struct list_head *list; int err = 0; + DEFINE_RESCHED_COUNT; +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - - repeat: for (list = inode->i_dirty_data_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers; list = bh->b_inode_buffers.prev) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -986,7 +1054,6 @@ int osync_inode_data_buffers(struct inod if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); - spin_lock(&lru_list_lock); goto repeat; } } @@ -1004,12 +1071,24 @@ int osync_inode_data_buffers(struct inod void invalidate_inode_buffers(struct inode *inode) { struct list_head * entry; - + +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) + while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } 
remove_inode_queue(BH_ENTRY(entry)); - while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) + } + while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } remove_inode_queue(BH_ENTRY(entry)); + } spin_unlock(&lru_list_lock); } @@ -1184,8 +1263,10 @@ struct buffer_head * bread(kdev_t dev, i bh = getblk(dev, block, size); touch_buffer(bh); - if (buffer_uptodate(bh)) + if (buffer_uptodate(bh)) { + conditional_schedule(); return bh; + } ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) @@ -2780,7 +2861,7 @@ void __init buffer_init(unsigned long me DECLARE_WAIT_QUEUE_HEAD(bdflush_wait); -void wakeup_bdflush(void) +void wakeup_bdflush(void) { wake_up_interruptible(&bdflush_wait); } --- linux-2.4.18-pre6/fs/dcache.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/dcache.c Wed Jan 23 11:13:00 2002 @@ -320,11 +320,23 @@ static inline void prune_one_dentry(stru void prune_dcache(int count) { + DEFINE_RESCHED_COUNT; + +redo: spin_lock(&dcache_lock); for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_RESCHED_COUNT(100)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&dcache_lock); + unconditional_schedule(); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -479,6 +491,7 @@ static int select_parent(struct dentry * struct dentry *this_parent = parent; struct list_head *next; int found = 0; + DEFINE_RESCHED_COUNT; spin_lock(&dcache_lock); repeat: @@ -493,6 +506,13 @@ resume: list_add(&dentry->d_lru, dentry_unused.prev); found++; } + + if (TEST_RESCHED_COUNT(500) && found > 10) { + if (conditional_schedule_needed()) /* Typically sys_rmdir() */ + goto out; + RESET_RESCHED_COUNT(); + } + /* * Descend a level if the d_subdirs list is non-empty. 
*/ @@ -517,6 +537,7 @@ this_parent->d_parent->d_name.name, this #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -532,8 +553,10 @@ void shrink_dcache_parent(struct dentry { int found; - while ((found = select_parent(parent)) != 0) + while ((found = select_parent(parent)) != 0) { prune_dcache(found); + conditional_schedule(); /* Typically sys_rmdir() */ + } } /* --- linux-2.4.18-pre6/fs/exec.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/exec.c Wed Jan 23 11:13:00 2002 @@ -223,7 +223,7 @@ int copy_strings(int argc,char ** argv, if (new) memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); } - err = copy_from_user(kaddr + offset, str, bytes_to_copy); + err = ll_copy_from_user(kaddr + offset, str, bytes_to_copy); kunmap(page); if (err) --- linux-2.4.18-pre6/fs/ext2/dir.c Mon Sep 17 13:16:30 2001 +++ linux-akpm/fs/ext2/dir.c Wed Jan 23 11:13:00 2002 @@ -148,6 +148,7 @@ static struct page * ext2_get_page(struc struct address_space *mapping = dir->i_mapping; struct page *page = read_cache_page(mapping, n, (filler_t*)mapping->a_ops->readpage, NULL); + conditional_schedule(); /* Scanning large directories */ if (!IS_ERR(page)) { wait_on_page(page); kmap(page); --- linux-2.4.18-pre6/fs/ext2/inode.c Thu Nov 22 23:02:58 2001 +++ linux-akpm/fs/ext2/inode.c Wed Jan 23 11:13:00 2002 @@ -715,8 +715,13 @@ static inline void ext2_free_data(struct { unsigned long block_to_free = 0, count = 0; unsigned long nr; + DEFINE_RESCHED_COUNT; for ( ; p < q ; p++) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + conditional_schedule(); + } nr = le32_to_cpu(*p); if (nr) { *p = 0; @@ -759,6 +764,7 @@ static void ext2_free_branches(struct in if (depth--) { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); for ( ; p < q ; p++) { + conditional_schedule(); /* Deleting large files */ nr = le32_to_cpu(*p); if (!nr) continue; --- linux-2.4.18-pre6/fs/inode.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/inode.c Wed Jan 23 11:13:00 2002 @@ -229,6 +229,8 @@ static inline void __sync_one(struct ino filemap_fdatawait(inode->i_mapping); + conditional_schedule(); + spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; if (!(inode->i_state & I_FREEING)) { @@ -539,6 +541,7 @@ static void dispose_list(struct list_hea while ((inode_entry = head->next) != head) { + conditional_schedule(); list_del(inode_entry); inode = list_entry(inode_entry, struct inode, i_list); @@ -567,9 +570,22 @@ static int invalidate_list(struct list_h if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + if (conditional_schedule_needed()) { + atomic_inc(&inode->i_count); + spin_unlock(&inode_lock); + unconditional_schedule(); + spin_lock(&inode_lock); + atomic_dec(&inode->i_count); + } + if (inode->i_sb != sb) continue; + atomic_inc(&inode->i_count); + spin_unlock(&inode_lock); invalidate_inode_buffers(inode); + spin_lock(&inode_lock); + atomic_dec(&inode->i_count); if (!atomic_read(&inode->i_count)) { list_del_init(&inode->i_hash); list_del(&inode->i_list); @@ -667,15 +683,28 @@ void prune_icache(int goal) struct list_head *entry, *freeable = &list; int count; struct inode * inode; + int nr_to_scan = inodes_stat.nr_unused; +resume: spin_lock(&inode_lock); - count = 0; entry = inode_unused.prev; - while (entry != &inode_unused) - { + while (entry != &inode_unused && nr_to_scan--) { struct list_head *tmp = entry; + if (conditional_schedule_needed()) { + /* + * Need to drop the lock. Reposition + * the list head so we start here next time. 
+ * This can corrupt the LRU nature of the + * unused list, but this isn't very important. + */ + list_del(&inode_unused); + list_add(&inode_unused, entry); + spin_unlock(&inode_lock); + unconditional_schedule(); + goto resume; + } entry = entry->prev; inode = INODE(tmp); if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) @@ -841,6 +870,8 @@ static struct inode * get_new_inode(stru inode = alloc_inode(); if (inode) { struct inode * old; + + conditional_schedule(); /* sync_old_buffers */ spin_lock(&inode_lock); /* We released the lock, so.. */ --- linux-2.4.18-pre6/fs/proc/array.c Thu Oct 11 09:00:01 2001 +++ linux-akpm/fs/proc/array.c Wed Jan 23 11:13:00 2002 @@ -412,9 +412,11 @@ static inline void statm_pte_range(pmd_t if (end > PMD_SIZE) end = PMD_SIZE; do { - pte_t page = *pte; + pte_t page; struct page *ptpage; + conditional_schedule(); /* For `top' and `ps' */ + page = *pte; address += PAGE_SIZE; pte++; if (pte_none(page)) --- linux-2.4.18-pre6/fs/proc/generic.c Fri Sep 7 10:53:59 2001 +++ linux-akpm/fs/proc/generic.c Wed Jan 23 11:13:00 2002 @@ -98,6 +98,8 @@ proc_file_read(struct file * file, char retval = n; break; } + + conditional_schedule(); /* Some /proc files are large */ /* This is a hack to allow mangling of file pos independent * of actual bytes read. Simply place the data at page, --- linux-2.4.18-pre6/fs/reiserfs/bitmap.c Fri Dec 21 11:19:22 2001 +++ linux-akpm/fs/reiserfs/bitmap.c Wed Jan 23 11:13:00 2002 @@ -412,17 +412,27 @@ free_and_return: } - reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; +/* this check needs to go before preparing the buffer because that can +** schedule when low-latency patches are in use. It is ok if the buffer +** is locked, preparing it will wait on it, and we handle the case where +** this block was allocated while we sleep below. +*/ + RFALSE( is_reusable (s, search_start, 0) == 0, + "vs-4140: bad block number found"); - RFALSE( buffer_locked (SB_AP_BITMAP (s)[i]) || - is_reusable (s, search_start, 0) == 0, - "vs-4140: bitmap block is locked or bad block number found"); + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; /* if this bit was already set, we've scheduled, and someone else ** has allocated it. loop around and try again */ if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; + /* if this block has been allocated while we slept, it is + ** impossible to find any more contiguous blocks for ourselves. + ** If we are doing preallocation, give up now and return. 
+ */ + if (for_prealloc) + goto free_and_return ; amount_needed++ ; continue ; } --- linux-2.4.18-pre6/fs/reiserfs/buffer2.c Fri Dec 21 11:19:22 2001 +++ linux-akpm/fs/reiserfs/buffer2.c Wed Jan 23 11:13:00 2002 @@ -55,6 +55,7 @@ struct buffer_head * reiserfs_bread (st PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); result = bread (super -> s_dev, n_block, n_size); + conditional_schedule(); PROC_INFO_INC( super, breads ); PROC_EXP( if( kstat.context_swtch != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); --- linux-2.4.18-pre6/fs/reiserfs/journal.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/reiserfs/journal.c Wed Jan 23 11:13:00 2002 @@ -574,6 +574,7 @@ inline void insert_journal_hash(struct r /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC( p_s_sb, journal.lock_journal ); + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; @@ -704,6 +705,7 @@ reiserfs_panic(s, "journal-539: flush_co mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + conditional_schedule(); count++ ; put_bh(tbh) ; /* once for our get_hash */ } @@ -833,6 +835,7 @@ static int _update_journal_header_block( set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -2092,6 +2095,7 @@ static int journal_join(struct reiserfs_ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2232,6 +2236,7 @@ int journal_mark_dirty_nolog(struct reis } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2683,6 +2688,7 @@ void reiserfs_prepare_for_journal(struct RFALSE( buffer_locked(bh) && cur_tb != NULL, "waiting while do_balance was running\n") ; wait_on_buffer(bh) ; + conditional_schedule(); } PROC_INFO_INC( p_s_sb, journal.prepare_retry ); retry_count++ ; @@ -2856,6 +2862,7 @@ printk("journal-2020: do_journal_end: BA /* copy all the real blocks into log area. dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + conditional_schedule(); /* getblk can sleep, so... 
*/ tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), p_s_sb->s_blocksize) ; --- linux-2.4.18-pre6/fs/jbd/checkpoint.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/jbd/checkpoint.c Wed Jan 23 11:13:00 2002 @@ -431,7 +431,11 @@ int __journal_clean_checkpoint_list(jour { transaction_t *transaction, *last_transaction, *next_transaction; int ret = 0; + int ll_retries = 4; /* lowlatency addition */ +restart: + if (ll_retries-- == 0) + goto out; transaction = journal->j_checkpoint_transactions; if (transaction == 0) goto out; @@ -451,6 +455,12 @@ int __journal_clean_checkpoint_list(jour jh = next_jh; next_jh = jh->b_cpnext; ret += __try_to_free_cp_buf(jh); + if (conditional_schedule_needed()) { + spin_unlock(&journal_datalist_lock); + unconditional_schedule(); + spin_lock(&journal_datalist_lock); + goto restart; + } } while (jh != last_jh); } } while (transaction != last_transaction); --- linux-2.4.18-pre6/fs/jbd/commit.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/fs/jbd/commit.c Wed Jan 23 11:13:00 2002 @@ -212,6 +212,16 @@ write_out_data_locked: __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); + if (conditional_schedule_needed()) { + if (commit_transaction->t_sync_datalist) + commit_transaction->t_sync_datalist = + next_jh; + if (bufs) + break; + spin_unlock(&journal_datalist_lock); + unconditional_schedule(); + goto write_out_data; + } } } if (bufs == ARRAY_SIZE(wbuf)) { @@ -235,8 +245,7 @@ write_out_data_locked: journal_brelse_array(wbuf, bufs); lock_journal(journal); spin_lock(&journal_datalist_lock); - if (bufs) - goto write_out_data_locked; + goto write_out_data_locked; } /* @@ -272,6 +281,14 @@ sync_datalist_empty: */ while ((jh = commit_transaction->t_async_datalist)) { struct buffer_head *bh = jh2bh(jh); + if (conditional_schedule_needed()) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + unconditional_schedule(); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; /* List may have changed */ + } if (buffer_locked(bh)) { spin_unlock(&journal_datalist_lock); unlock_journal(journal); @@ -486,6 +503,8 @@ start_journal_io: wait_for_iobuf: while (commit_transaction->t_iobuf_list != NULL) { struct buffer_head *bh; + + conditional_schedule(); jh = commit_transaction->t_iobuf_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { @@ -622,6 +641,8 @@ skip_commit: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + + conditional_schedule(); /* journal is locked */ jh = commit_transaction->t_forget; J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || --- linux-2.4.18-pre6/fs/ext3/balloc.c Thu Nov 22 23:02:58 2001 +++ linux-akpm/fs/ext3/balloc.c Wed Jan 23 11:13:00 2002 @@ -365,6 +365,9 @@ do_more: } } #endif + /* superblock lock is held, so this is safe */ + conditional_schedule(); + BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { ext3_error (sb, __FUNCTION__, --- linux-2.4.18-pre6/fs/ext3/inode.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/ext3/inode.c Wed Jan 23 11:13:00 2002 @@ -905,6 +905,8 @@ struct buffer_head *ext3_bread(handle_t prev_blocks = inode->i_blocks; + conditional_schedule(); /* Reading large directories */ + bh = ext3_getblk (handle, inode, block, create, err); if (!bh) return bh; @@ -1600,6 +1602,7 @@ ext3_clear_blocks(handle_t *handle, stru */ for (p = first; p < last; p++) { u32 nr = le32_to_cpu(*p); + conditional_schedule(); if (nr) { struct buffer_head *bh; 
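For reference: with CONFIG_LOLAT enabled and the LOWLATENCY_DEBUG / LOWLATENCY_ALWAYS_SCHEDULE knobs in include/linux/low-latency.h left at 0, each bare conditional_schedule() sprinkled through these filesystem paths reduces to roughly the following, so the fast-path cost is a single test of current->need_resched:

	if (current->need_resched) {		/* conditional_schedule_needed() */
		/* unconditional_schedule() == set_running_and_schedule(),
		 * added to kernel/sched.c later in this patch */
		if (current->state != TASK_RUNNING)
			__set_current_state(TASK_RUNNING);
		schedule();
	}
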
@@ -1654,6 +1657,7 @@ static void ext3_free_data(handle_t *han } for (p = first; p < last; p++) { + conditional_schedule(); nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ @@ -1718,6 +1722,8 @@ static void ext3_free_branches(handle_t /* Go read the buffer for the next level down */ bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + + conditional_schedule(); /* * A read failure? Report error and clear slot --- linux-2.4.18-pre6/fs/ext3/namei.c Thu Nov 22 23:02:58 2001 +++ linux-akpm/fs/ext3/namei.c Wed Jan 23 11:13:00 2002 @@ -157,6 +157,7 @@ restart: if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); + conditional_schedule(); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ brelse(bh); --- linux-2.4.18-pre6/kernel/exit.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/kernel/exit.c Wed Jan 23 11:13:00 2002 @@ -203,6 +203,7 @@ static inline void close_files(struct fi } i++; set >>= 1; + conditional_schedule(); /* sys_exit, many files open */ } } } --- linux-2.4.18-pre6/kernel/ksyms.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/kernel/ksyms.c Wed Jan 23 11:13:00 2002 @@ -439,6 +439,13 @@ EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); EXPORT_SYMBOL(do_settimeofday); +#if LOWLATENCY_NEEDED +EXPORT_SYMBOL(set_running_and_schedule); +#ifdef CONFIG_LOLAT_SYSCTL +EXPORT_SYMBOL(__enable_lowlatency); +#endif +#endif + #if !defined(__ia64__) EXPORT_SYMBOL(loops_per_jiffy); #endif --- linux-2.4.18-pre6/kernel/module.c Thu Nov 22 23:02:59 2001 +++ linux-akpm/kernel/module.c Wed Jan 23 11:13:00 2002 @@ -1174,6 +1174,11 @@ static void *s_start(struct seq_file *m, return ERR_PTR(-ENOMEM); lock_kernel(); for (v = module_list, n = *pos; v; n -= v->nsyms, v = v->next) { +#if 0 + /* We can't actually do this, because we'd create a + * race against module unload. Need a semaphore. 
*/ + conditional_schedule(); +#endif if (n < v->nsyms) { p->mod = v; p->index = n; --- linux-2.4.18-pre6/kernel/sched.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/kernel/sched.c Wed Jan 23 11:13:00 2002 @@ -302,6 +302,17 @@ send_now_idle: if (tsk->processor != this_cpu) smp_send_reschedule(tsk->processor); } +#if LOWLATENCY_NEEDED + if (enable_lowlatency && (p->policy != SCHED_OTHER)) { + struct task_struct *t; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + t = cpu_curr(cpu); + if (t != tsk) + t->need_resched = 1; + } + } +#endif return; @@ -1342,3 +1353,93 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } + +#if LOWLATENCY_NEEDED +#if LOWLATENCY_DEBUG + +static struct lolat_stats_t *lolat_stats_head; +static spinlock_t lolat_stats_lock = SPIN_LOCK_UNLOCKED; + +void set_running_and_schedule(struct lolat_stats_t *stats) +{ + spin_lock(&lolat_stats_lock); + if (stats->visited == 0) { + stats->visited = 1; + stats->next = lolat_stats_head; + lolat_stats_head = stats; + } + stats->count++; + spin_unlock(&lolat_stats_lock); + + if (current->state != TASK_RUNNING) + set_current_state(TASK_RUNNING); + schedule(); +} + +void show_lolat_stats(void) +{ + struct lolat_stats_t *stats = lolat_stats_head; + + printk("Low latency scheduling stats:\n"); + while (stats) { + printk("%s:%d: %lu\n", stats->file, stats->line, stats->count); + stats->count = 0; + stats = stats->next; + } +} + +#else /* LOWLATENCY_DEBUG */ + +void set_running_and_schedule() +{ + if (current->state != TASK_RUNNING) + __set_current_state(TASK_RUNNING); + schedule(); +} + +#endif /* LOWLATENCY_DEBUG */ + +int ll_copy_to_user(void *to_user, const void *from, unsigned long len) +{ + while (len) { + unsigned long n_to_copy = len; + unsigned long remainder; + + if (n_to_copy > 4096) + n_to_copy = 4096; + remainder = copy_to_user(to_user, from, n_to_copy); + if (remainder) + return remainder + len; + to_user = ((char *)to_user) + n_to_copy; + from = ((char *)from) + n_to_copy; + len -= n_to_copy; + conditional_schedule(); + } + return 0; +} + +int ll_copy_from_user(void *to, const void *from_user, unsigned long len) +{ + while (len) { + unsigned long n_to_copy = len; + unsigned long remainder; + + if (n_to_copy > 4096) + n_to_copy = 4096; + remainder = copy_from_user(to, from_user, n_to_copy); + if (remainder) + return remainder + len; + to = ((char *)to) + n_to_copy; + from_user = ((char *)from_user) + n_to_copy; + len -= n_to_copy; + conditional_schedule(); + } + return 0; +} + +#ifdef CONFIG_LOLAT_SYSCTL +struct low_latency_enable_struct __enable_lowlatency = { 0, }; +#endif + +#endif /* LOWLATENCY_NEEDED */ + --- linux-2.4.18-pre6/kernel/sysctl.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/kernel/sysctl.c Wed Jan 23 11:13:00 2002 @@ -256,6 +256,10 @@ static ctl_table kern_table[] = { {KERN_S390_USER_DEBUG_LOGGING,"userprocess_debug", &sysctl_userprocess_debug,sizeof(int),0644,NULL,&proc_dointvec}, #endif +#ifdef CONFIG_LOLAT_SYSCTL + {KERN_LOWLATENCY, "lowlatency", &enable_lowlatency, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif {0} }; --- linux-2.4.18-pre6/mm/filemap.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/mm/filemap.c Wed Jan 23 11:13:00 2002 @@ -187,6 +187,18 @@ void invalidate_inode_pages(struct inode page = list_entry(curr, struct page, list); curr = curr->next; + if (conditional_schedule_needed() && !TryLockPage(page)) { + /* + * Page is locked. 
We can drop the spinlocks + */ + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + unconditional_schedule(); + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + UnlockPage(page); + } + /* We cannot invalidate something in dirty.. */ if (PageDirty(page)) continue; @@ -256,12 +268,24 @@ static int truncate_list_pages(struct li struct list_head *curr; struct page * page; int unlocked = 0; + int restart_count = 4; restart: curr = head->prev; while (curr != head) { unsigned long offset; + if (conditional_schedule_needed() && restart_count) { + restart_count--; + list_del(head); + list_add(head, curr); /* Restart on this page */ + spin_unlock(&pagecache_lock); + unconditional_schedule(); + spin_lock(&pagecache_lock); + unlocked = 1; + goto restart; + } + page = list_entry(curr, struct page, list); offset = page->index; @@ -294,13 +318,11 @@ static int truncate_list_pages(struct li } else wait_on_page(page); - page_cache_release(page); - - if (current->need_resched) { - __set_current_state(TASK_RUNNING); - schedule(); + if (LOWLATENCY_NEEDED) { + restart_count = 4; /* We made progress */ } + page_cache_release(page); spin_lock(&pagecache_lock); goto restart; } @@ -509,6 +531,7 @@ static int do_buffer_fdatasync(struct li page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() (only used by minixfs, udf) */ lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ @@ -595,12 +618,14 @@ int filemap_fdatasync(struct address_spa list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) - continue; - page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() */ + + if (!PageDirty(page)) + goto clean; + lock_page(page); if (PageDirty(page)) { @@ -611,7 +636,7 @@ int filemap_fdatasync(struct address_spa ret = err; } else UnlockPage(page); - +clean: page_cache_release(page); spin_lock(&pagecache_lock); } @@ -629,7 +654,8 @@ int filemap_fdatasync(struct address_spa int filemap_fdatawait(struct address_space * mapping) { int ret = 0; - + DEFINE_RESCHED_COUNT; +restart: spin_lock(&pagecache_lock); while (!list_empty(&mapping->locked_pages)) { @@ -638,6 +664,17 @@ int filemap_fdatawait(struct address_spa list_del(&page->list); list_add(&page->list, &mapping->clean_pages); + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + page_cache_get(page); + spin_unlock(&pagecache_lock); + unconditional_schedule(); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -738,8 +775,10 @@ static int page_cache_read(struct file * spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); spin_unlock(&pagecache_lock); - if (page) + if (page) { + conditional_schedule(); return 0; + } page = page_cache_alloc(mapping); if (!page) @@ -904,6 +943,11 @@ static struct page * __find_lock_page_he * the hash-list needs a held write-lock. 
*/ repeat: + if (conditional_schedule_needed()) { + spin_unlock(&pagecache_lock); + unconditional_schedule(); + spin_lock(&pagecache_lock); + } page = __find_page_nolock(mapping, offset, hash); if (page) { page_cache_get(page); @@ -1368,6 +1412,8 @@ found_page: page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_read() */ + if (!Page_Uptodate(page)) goto page_not_up_to_date; generic_file_readahead(reada_ok, filp, inode, page); @@ -2066,6 +2112,12 @@ static inline int filemap_sync_pte_range address += PAGE_SIZE; pte++; } while (address && (address < end)); + + if (conditional_schedule_needed()) { + spin_unlock(&vma->vm_mm->page_table_lock); + unconditional_schedule(); /* syncing large mapped files */ + spin_lock(&vma->vm_mm->page_table_lock); + } return error; } @@ -2473,7 +2525,9 @@ static long madvise_dontneed(struct vm_a if (vma->vm_flags & VM_LOCKED) return -EINVAL; - zap_page_range(vma->vm_mm, start, end - start); + zap_page_range(vma->vm_mm, start, end - start, + ZPR_COND_RESCHED); /* sys_madvise(MADV_DONTNEED) */ + return 0; } @@ -3038,6 +3092,9 @@ generic_file_write(struct file *file,con goto sync_failure; page_fault = __copy_from_user(kaddr+offset, buf, bytes); flush_dcache_page(page); + + conditional_schedule(); + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); if (page_fault) goto fail_write; --- linux-2.4.18-pre6/mm/memory.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/mm/memory.c Wed Jan 23 11:13:00 2002 @@ -355,7 +355,7 @@ static inline int zap_pmd_range(mmu_gath /* * remove user pages in a given range. */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +static void do_zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; @@ -465,6 +465,10 @@ int get_user_pages(struct task_struct *t struct page *map; while (!(map = follow_page(mm, start, write))) { spin_unlock(&mm->page_table_lock); + + /* Pinning down many physical pages (kiobufs, mlockall) */ + conditional_schedule(); + switch (handle_mm_fault(mm, vma, start, write)) { case 1: tsk->min_flt++; @@ -609,6 +613,21 @@ void unmap_kiobuf (struct kiobuf *iobuf) iobuf->locked = 0; } +#define MAX_ZAP_BYTES 256*PAGE_SIZE + +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + if (actions & ZPR_COND_RESCHED && chunk > MAX_ZAP_BYTES) + chunk = MAX_ZAP_BYTES; + do_zap_page_range(mm, address, chunk); + if (actions & ZPR_COND_RESCHED) + conditional_schedule(); + address += chunk; + size -= chunk; + } +} /* * Lock down all of the pages of a kiovec for IO. 
@@ -718,11 +737,18 @@ int unlock_kiovec(int nr, struct kiobuf return 0; } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte, + unsigned long address, unsigned long size, + pgprot_t prot) { unsigned long end; + if (conditional_schedule_needed()) { + spin_unlock(&mm->page_table_lock); + unconditional_schedule(); /* mmap(/dev/zero) */ + spin_lock(&mm->page_table_lock); + } + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -750,7 +776,7 @@ static inline int zeromap_pmd_range(stru pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(mm, pte, address, end - address, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -984,7 +1010,7 @@ static void vmtruncate_list(struct vm_ar /* mapping wholly truncated? */ if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, 0); continue; } @@ -997,7 +1023,7 @@ static void vmtruncate_list(struct vm_ar /* Ok, partially affected.. */ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, 0); } while ((mpnt = mpnt->vm_next_share) != NULL); } --- linux-2.4.18-pre6/mm/mmap.c Mon Nov 5 21:01:12 2001 +++ linux-akpm/mm/mmap.c Wed Jan 23 11:13:00 2002 @@ -569,7 +569,7 @@ unmap_and_free_vma: fput(file); /* Undo any partial mapping done by a device driver. */ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, 0); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -967,7 +967,7 @@ int do_munmap(struct mm_struct *mm, unsi remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + zap_page_range(mm, st, size, ZPR_COND_RESCHED); /* sys_munmap() */ /* * Fix the mapping, and free the old area if it wasn't reused. 
@@ -1127,7 +1127,7 @@ void exit_mmap(struct mm_struct * mm) } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_COND_RESCHED); /* sys_exit() */ if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); --- linux-2.4.18-pre6/mm/mremap.c Thu Sep 20 20:31:26 2001 +++ linux-akpm/mm/mremap.c Wed Jan 23 11:13:00 2002 @@ -118,7 +118,7 @@ oops_we_failed: flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + zap_page_range(mm, new_addr, len, 0); return -1; } --- linux-2.4.18-pre6/mm/slab.c Fri Dec 21 11:19:23 2001 +++ linux-akpm/mm/slab.c Wed Jan 23 11:13:00 2002 @@ -937,6 +937,7 @@ static int __kmem_cache_shrink(kmem_cach spin_unlock_irq(&cachep->spinlock); kmem_slab_destroy(cachep, slabp); + conditional_schedule(); /* Can take 30 milliseconds */ spin_lock_irq(&cachep->spinlock); } ret = !list_empty(&cachep->slabs_full) || !list_empty(&cachep->slabs_partial); @@ -1821,6 +1822,7 @@ perfect: */ spin_unlock_irq(&best_cachep->spinlock); kmem_slab_destroy(best_cachep, slabp); + conditional_schedule(); /* try_to_free_pages() */ spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); --- linux-2.4.18-pre6/mm/swapfile.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/mm/swapfile.c Wed Jan 23 11:13:00 2002 @@ -823,7 +823,7 @@ int get_swaparea_info(char *buf) len += sprintf(buf + len, "partition\t"); usedswap = 0; - for (j = 0; j < ptr->max; ++j) + for (j = 0; j < ptr->max; ++j) { switch (ptr->swap_map[j]) { case SWAP_MAP_BAD: case 0: @@ -831,6 +831,8 @@ int get_swaparea_info(char *buf) default: usedswap++; } + conditional_schedule(); + } len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), usedswap << (PAGE_SHIFT - 10), ptr->prio); } @@ -1124,6 +1126,11 @@ void si_swapinfo(struct sysinfo *val) if (swap_info[i].flags != SWP_USED) continue; for (j = 0; j < swap_info[i].max; ++j) { + if (conditional_schedule_needed()) { + swap_list_unlock(); + conditional_schedule(); + swap_list_lock(); + } switch (swap_info[i].swap_map[j]) { case 0: case SWAP_MAP_BAD: --- linux-2.4.18-pre6/mm/vmscan.c Tue Jan 22 12:38:31 2002 +++ linux-akpm/mm/vmscan.c Wed Jan 23 11:13:00 2002 @@ -157,6 +157,7 @@ static inline int swap_out_pmd(struct mm { pte_t * pte; unsigned long pmd_end; + DEFINE_RESCHED_COUNT; if (pmd_none(*dir)) return count; @@ -182,11 +183,17 @@ static inline int swap_out_pmd(struct mm address += PAGE_SIZE; break; } + if (TEST_RESCHED_COUNT(4)) { + if (conditional_schedule_needed()) + goto out; + RESET_RESCHED_COUNT(); + } } } address += PAGE_SIZE; pte++; } while (address && (address < end)); +out: mm->swap_address = address; return count; } @@ -215,6 +222,8 @@ static inline int swap_out_pgd(struct mm count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); if (!count) break; + if (conditional_schedule_needed()) + return count; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -240,6 +249,8 @@ static inline int swap_out_vma(struct mm count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); if (!count) break; + if (conditional_schedule_needed()) + return count; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); @@ -261,6 +272,7 @@ static inline int swap_out_mm(struct mm_ * Find the proper vm-area after freezing the vma chain * and ptes. 
*/ +continue_scan: spin_lock(&mm->page_table_lock); address = mm->swap_address; if (address == TASK_SIZE || swap_mm != mm) { @@ -278,6 +290,12 @@ static inline int swap_out_mm(struct mm_ vma = vma->vm_next; if (!vma) break; + if (conditional_schedule_needed()) { /* Scanning a large vma */ + spin_unlock(&mm->page_table_lock); + unconditional_schedule(); + /* Continue from where we left off */ + goto continue_scan; + } if (!count) goto out_unlock; address = vma->vm_start; --- linux-2.4.18-pre6/net/core/iovec.c Mon Sep 10 07:57:00 2001 +++ linux-akpm/net/core/iovec.c Wed Jan 23 11:13:00 2002 @@ -88,7 +88,7 @@ int memcpy_toiovec(struct iovec *iov, un if(iov->iov_len) { int copy = min_t(unsigned int, iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) + if (ll_copy_to_user(iov->iov_base, kdata, copy)) goto out; kdata+=copy; len-=copy; --- linux-2.4.18-pre6/net/ipv4/tcp_minisocks.c Mon Oct 1 09:19:57 2001 +++ linux-akpm/net/ipv4/tcp_minisocks.c Wed Jan 23 11:13:00 2002 @@ -434,6 +434,9 @@ static void SMP_TIMER_NAME(tcp_twkill)(u { struct tcp_tw_bucket *tw; int killed = 0; +#if LOWLATENCY_NEEDED + int max_killed = 0; +#endif /* NOTE: compare this to previous version where lock * was released after detaching chain. It was racy, @@ -447,6 +450,13 @@ static void SMP_TIMER_NAME(tcp_twkill)(u goto out; while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { +#if LOWLATENCY_NEEDED + /* This loop takes ~6 usecs per iteration. */ + if (killed > 100) { + max_killed = 1; + break; + } +#endif tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; tw->pprev_death = NULL; spin_unlock(&tw_death_lock); @@ -457,12 +467,24 @@ static void SMP_TIMER_NAME(tcp_twkill)(u killed++; spin_lock(&tw_death_lock); + + } + +#if LOWLATENCY_NEEDED + if (max_killed) { /* More to do: do it soon */ + mod_timer(&tcp_tw_timer, jiffies+2); + tcp_tw_count -= killed; + } + else +#endif + { + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - if ((tcp_tw_count -= killed) != 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); net_statistics[smp_processor_id()*2].TimeWaited += killed; out: spin_unlock(&tw_death_lock);
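
The copy_to_user()/copy_from_user() conversions in fs/exec.c and net/core/iovec.c above all have the same shape; a hypothetical example of the pattern (the function and its ubuf/kbuf/count parameters are invented for illustration — only ll_copy_to_user() comes from the patch):

	#include <asm/uaccess.h>
	#include <linux/low-latency.h>

	/* Copy a potentially large result buffer to userspace.  On resident
	 * pages a single copy_to_user() offers no reschedule point;
	 * ll_copy_to_user() (kernel/sched.c above) copies at most 4096 bytes
	 * per iteration and calls conditional_schedule() between chunks.
	 * With CONFIG_LOLAT off it is #defined back to copy_to_user(). */
	static int copy_result_to_user(char *ubuf, const char *kbuf, unsigned long count)
	{
		if (ll_copy_to_user(ubuf, kbuf, count))
			return -EFAULT;
		return 0;
	}

zap_page_range() applies the same chunk-and-yield idea under ZPR_COND_RESCHED: at most MAX_ZAP_BYTES (256 pages) per do_zap_page_range() call, with a conditional_schedule() between chunks. With CONFIG_LOLAT_SYSCTL the whole mechanism can be switched at runtime through the new "lowlatency" entry under /proc/sys/kernel.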