This is a patch which improves the kernel's scheduling latency under heavy filesystem activity. I am proposing this for the 2.4 series. Parts of it are from the -aa patchset.

It adds ten rescheduling points and pops spinlocks in two places. It's all pretty obvious and safe, apart from the ext3 chunk which required some thought.

Testing was on an 850MHz PIII with 768 megabytes of RAM. The workload executed was:

    cd /usr/src/linux
    make -j3 bzImage
    make clean
    cd /mnt/hda7
    tar xfz /nfs-mountpoint/linux-2.4.10.tar.gz
    rm -rf linux
    dd if=/dev/zero of=foo bs=1M count=400
    rm foo
    cvs co dbench
    cd dbench
    make
    ./dbench 40
    sync
    cd ..
    rm -rf dbench

In conjunction with this, the `realfeel2' app from http://www.zip.com.au/~akpm/linux/amlat.tar.gz was used to accumulate a histogram of the latency between occurrence of an interrupt and the scheduling of the userspace process which was woken as a result of that interrupt.

Testing is against 2.4.18-pre1. The table shows the number of times a particular latency was encountered with this workload, for various patchsets. The filesystem was ext3. ext2 does better.

                                  Latency (milliseconds)
    Kernel                 0-1   1-2   2-4   4-8  8-16 16-32 32-64 64-128 128-256
    Stock                1.2e6   365    32    33   417    11    11      4
    rml-preempt          5.0e6   119    26    22    11     8    13      3       1
    mini-ll              1.8e6   217    15    15     8     9     1
    mini-ll+rml-preempt  1.4e6    96    14     7     4     2     5      1
    low-latency          1.3e6    18     1

The 5.0e6 count on the second row is due to extra idle time (I hadn't scripted the test).

This is a logarithmic representation, so the mini-ll improvements do not look really fantastic, but in fact it is a very considerable improvement. Worst-case latency with mini-ll is 36 milliseconds (versus 84 for the stock kernel). Bear in mind that the intent here is mouse-no-jerky and fewer audio dropouts. If the requirement is for very low worst-case latency, you'll still need a low-latency patch.
Here's the full histogram for the latest low-latency patch: 0.0 1271218 0.1 4554 0.2 472 0.3 78 0.4 10 0.5 5 0.6 24 0.7 27 0.8 20 0.9 10 1.0 15 1.1 1 1.2 1 1.7 1 3.2 1 Which is butt-kicking. It can be improved by minimising the amount of online swapspace. scan_swap_map(). --- linux-2.4.18-pre1/fs/buffer.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/buffer.c Sun Jan 6 01:06:41 2002 @@ -249,12 +249,19 @@ static int wait_for_buffers(kdev_t dev, struct buffer_head * next; int nr; - next = lru_list[index]; nr = nr_buffers_type[index]; +repeat: + next = lru_list[index]; while (next && --nr >= 0) { struct buffer_head *bh = next; next = bh->b_next_free; + if (dev == NODEV && current->need_resched) { + spin_unlock(&lru_list_lock); + conditional_schedule(); + spin_lock(&lru_list_lock); + goto repeat; + } if (!buffer_locked(bh)) { if (refile) __refile_buffer(bh); @@ -1174,8 +1181,10 @@ struct buffer_head * bread(kdev_t dev, i bh = getblk(dev, block, size); touch_buffer(bh); - if (buffer_uptodate(bh)) + if (buffer_uptodate(bh)) { + conditional_schedule(); return bh; + } ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) --- linux-2.4.18-pre1/fs/dcache.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/dcache.c Sat Jan 5 23:49:00 2002 @@ -71,7 +71,7 @@ static inline void d_free(struct dentry * d_iput() operation if defined. * Called with dcache_lock held, drops it. 
*/ -static inline void dentry_iput(struct dentry * dentry) +static void dentry_iput(struct dentry * dentry) { struct inode *inode = dentry->d_inode; if (inode) { @@ -84,6 +84,7 @@ static inline void dentry_iput(struct de iput(inode); } else spin_unlock(&dcache_lock); + conditional_schedule(); } /* --- linux-2.4.18-pre1/fs/jbd/commit.c Fri Dec 21 11:19:14 2001 +++ linux-akpm/fs/jbd/commit.c Sun Jan 6 02:43:43 2002 @@ -212,6 +212,16 @@ write_out_data_locked: __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); + if (current->need_resched) { + if (commit_transaction->t_sync_datalist) + commit_transaction->t_sync_datalist = + next_jh; + if (bufs) + break; + spin_unlock(&journal_datalist_lock); + conditional_schedule(); + goto write_out_data; + } } } if (bufs == ARRAY_SIZE(wbuf)) { --- linux-2.4.18-pre1/fs/proc/array.c Thu Oct 11 09:00:01 2001 +++ linux-akpm/fs/proc/array.c Sat Jan 5 23:49:00 2002 @@ -415,6 +415,8 @@ static inline void statm_pte_range(pmd_t pte_t page = *pte; struct page *ptpage; + conditional_schedule(); + address += PAGE_SIZE; pte++; if (pte_none(page)) --- linux-2.4.18-pre1/fs/proc/generic.c Fri Sep 7 10:53:59 2001 +++ linux-akpm/fs/proc/generic.c Sat Jan 5 23:49:00 2002 @@ -98,7 +98,9 @@ proc_file_read(struct file * file, char retval = n; break; } - + + conditional_schedule(); + /* This is a hack to allow mangling of file pos independent * of actual bytes read. 
Simply place the data at page, * return the bytes, and set `start' to the desired offset --- linux-2.4.18-pre1/include/linux/condsched.h Thu Jan 1 00:00:00 1970 +++ linux-akpm/include/linux/condsched.h Sat Jan 5 23:49:00 2002 @@ -0,0 +1,18 @@ +#ifndef _LINUX_CONDSCHED_H +#define _LINUX_CONDSCHED_H + +#ifndef __LINUX_COMPILER_H +#include <linux/compiler.h> +#endif + +#ifndef __ASSEMBLY__ +#define conditional_schedule() \ +do { \ + if (unlikely(current->need_resched)) { \ + __set_current_state(TASK_RUNNING); \ + schedule(); \ + } \ +} while(0) +#endif + +#endif --- linux-2.4.18-pre1/include/linux/sched.h Fri Dec 21 11:19:23 2001 +++ linux-akpm/include/linux/sched.h Sun Jan 6 02:44:06 2002 @@ -13,6 +13,7 @@ extern unsigned long event; #include #include #include +#include <linux/condsched.h> #include #include --- linux-2.4.18-pre1/mm/filemap.c Wed Dec 26 11:47:41 2001 +++ linux-akpm/mm/filemap.c Sat Jan 5 23:49:00 2002 @@ -296,10 +296,7 @@ static int truncate_list_pages(struct li page_cache_release(page); - if (current->need_resched) { - __set_current_state(TASK_RUNNING); - schedule(); - } + conditional_schedule(); spin_lock(&pagecache_lock); goto restart; @@ -609,6 +606,7 @@ void filemap_fdatasync(struct address_sp UnlockPage(page); page_cache_release(page); + conditional_schedule(); spin_lock(&pagecache_lock); } spin_unlock(&pagecache_lock); @@ -1392,6 +1390,9 @@ page_ok: offset &= ~PAGE_CACHE_MASK; page_cache_release(page); + + conditional_schedule(); + if (ret == nr && desc->count) continue; break; @@ -3025,6 +3026,8 @@ unlock: SetPageReferenced(page); UnlockPage(page); page_cache_release(page); + + conditional_schedule(); if (status < 0) break; --- linux-2.4.18-pre1/drivers/block/ll_rw_blk.c Mon Nov 5 21:01:11 2001 +++ linux-akpm/drivers/block/ll_rw_blk.c Sat Jan 5 23:49:00 2002 @@ -913,6 +913,7 @@ void submit_bh(int rw, struct buffer_hea kstat.pgpgin += count; break; } + conditional_schedule(); } /** - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to 
majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/