# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.1014 -> 1.1015 # fs/ext2/inode.c 1.17 -> 1.18 # fs/ext2/balloc.c 1.4 -> 1.5 # fs/ext3/ialloc.c 1.6 -> 1.7 # fs/ext2/ialloc.c 1.11 -> 1.12 # fs/ext3/inode.c 1.15 -> 1.16 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/03/12 tytso@think.thunk.org 1.1015 # Andrew Morton's ext2/3 inode/block allocator improvements # # ext2 places non-directory objects into the same blockgroup as their # directory, as long as that directory has free inodes. It does this # even if there are no free blocks in that blockgroup (!). # # This means that if there are lots of files being created at a common # point in the tree, they _all_ have the same starting blockgroup. For # each file we do a big search forwards for the first block and the # allocations end up getting intermingled. # # So this patch will avoid placing new inodes in block groups which have # no free blocks. # # So far so good. But this means that if a lot of new files are being # created under a directory (or multiple directories) which are in the # same blockgroup, all the new inodes will overflow into the same # blockgroup. No improvement at all. # # So the patch arranges for the new inode locations to be "spread out" # across different blockgroups if they are not going to be placed in # their directory's block group. This is done by adding parent->i_ino # into the starting point for the quadratic hash. i_ino was chosen so # that files which are in the same directory will tend to all land in the # same new blockgroup. # # -------------------------------------------------- # # When an ext3 (or ext2) file is first created the filesystem has to # choose the initial starting block for its data allocations. 
In the # usual (new-file) case, that initial goal block is the zeroeth block of # a particular blockgroup. # # This is the worst possible choice. Because it _guarantees_ that this # file's blocks will be pessimally intermingled with the blocks of # another file which is growing within the same blockgroup. # # We've always had this problem with files in the same directory. With # the introduction of the Orlov allocator we now have the problem with # files in different directories. And it got noticed. This is the cause # of the post-Orlov 50% slowdown in dbench throughput on ext3 on # write-through caching SCSI on SMP. And 25% in ext2. # # It doesn't happen on uniprocessor because a single CPU will not exhibit # sufficient concurrency in allocation against two or more files. # # It will happen on uniprocessor if the files are growing slowly. # # It has always happened if the files are in the same directory. # # ext2 has the same problem but it is significantly less damaging there # because of ext2's eight-block per-inode preallocation window. # # The patch largely solves this problem by not always starting the # allocation goal at the zeroeth block of the blockgroup. We instead # chop the blockgroup into sixteen starting points and select one of those # based on the lower four bits of the calling process's PID. # # The PID was chosen as the index because this will help to ensure that # related files have the same starting goal. If one process is slowly # writing two files in the same directory, we still lose. # # Using the PID in the heuristic is a bit weird. As an alternative I # tried using the file's directory's i_ino. That fixed the dbench # problem OK but caused a 15% slowdown in the fast-growth `untar a kernel # tree' workload. Because this approach will cause files which are in # different directories to spread out more. Suppressing that behaviour # when the files are all being created by the same process is a # reasonable heuristic. 
# # I changed dbench to never unlink its files, and used e2fsck to # determine how many fragmented files were present after a `dbench 32' # run. With this patch and the next couple, ext2's fragmentation went # from 22% to 13% and ext3's from 25% to 10.4%. # # ------------------------ # # Renames the local variables `bh2', `i', `j', 'k', and `tmp' to # something meaningful. This brings ext2_new_block() into line with # ext3_new_block(). # -------------------------------------------- # --- linux-2.4-ext3merge/fs/ext2/balloc.c.=K0015=.orig 2003-03-13 16:22:52.000000000 +0000 +++ linux-2.4-ext3merge/fs/ext2/balloc.c 2003-03-14 18:19:13.000000000 +0000 @@ -358,9 +358,12 @@ int ext2_new_block (struct inode * inode u32 * prealloc_count, u32 * prealloc_block, int * err) { struct buffer_head * bh; - struct buffer_head * bh2; + struct buffer_head *gdp_bh; /* bh2 */ char * p, * r; - int i, j, k, tmp; + int group_no; /* i */ + int ret_block; /* j */ + int bit; /* k */ + int target_block; /* tmp */ int bitmap_nr; struct super_block * sb; struct ext2_group_desc * gdp; @@ -393,30 +396,30 @@ repeat: if (goal < le32_to_cpu(es->s_first_data_block) || goal >= le32_to_cpu(es->s_blocks_count)) goal = le32_to_cpu(es->s_first_data_block); - i = (goal - le32_to_cpu(es->s_first_data_block)) / EXT2_BLOCKS_PER_GROUP(sb); - gdp = ext2_get_group_desc (sb, i, &bh2); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / EXT2_BLOCKS_PER_GROUP(sb); + gdp = ext2_get_group_desc (sb, group_no, &gdp_bh); if (!gdp) goto io_error; if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { - j = ((goal - le32_to_cpu(es->s_first_data_block)) % EXT2_BLOCKS_PER_GROUP(sb)); + ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) % EXT2_BLOCKS_PER_GROUP(sb)); #ifdef EXT2FS_DEBUG - if (j) + if (ret_block) goal_attempts++; #endif - bitmap_nr = load_block_bitmap (sb, i); + bitmap_nr = load_block_bitmap (sb, group_no); if (bitmap_nr < 0) goto io_error; bh = sb->u.ext2_sb.s_block_bitmap[bitmap_nr]; - ext2_debug 
("goal is at %d:%d.\n", i, j); + ext2_debug ("goal is at %d:%d.\n", group_no, ret_block); - if (!ext2_test_bit(j, bh->b_data)) { + if (!ext2_test_bit(ret_block, bh->b_data)) { ext2_debug("goal bit allocated, %d hits\n",++goal_hits); goto got_block; } - if (j) { + if (ret_block) { /* * The goal was occupied; search forward for a free * block within the next XX blocks. @@ -425,9 +428,9 @@ repeat: * less than EXT2_BLOCKS_PER_GROUP. Aligning up to the * next 64-bit boundary is simple.. */ - int end_goal = (j + 63) & ~63; - j = ext2_find_next_zero_bit(bh->b_data, end_goal, j); - if (j < end_goal) + int end_goal = (ret_block + 63) & ~63; + ret_block = ext2_find_next_zero_bit(bh->b_data, end_goal, ret_block); + if (ret_block < end_goal) goto got_block; } @@ -442,19 +445,19 @@ repeat: * Search first in the remainder of the current group; then, * cyclicly search through the rest of the groups. */ - p = ((char *) bh->b_data) + (j >> 3); - r = memscan(p, 0, (EXT2_BLOCKS_PER_GROUP(sb) - j + 7) >> 3); - k = (r - ((char *) bh->b_data)) << 3; - if (k < EXT2_BLOCKS_PER_GROUP(sb)) { - j = k; + p = ((char *) bh->b_data) + (ret_block >> 3); + r = memscan(p, 0, (EXT2_BLOCKS_PER_GROUP(sb) - ret_block + 7) >> 3); + bit = (r - ((char *) bh->b_data)) << 3; + if (bit < EXT2_BLOCKS_PER_GROUP(sb)) { + ret_block = bit; goto search_back; } - k = ext2_find_next_zero_bit ((unsigned long *) bh->b_data, + bit = ext2_find_next_zero_bit ((unsigned long *) bh->b_data, EXT2_BLOCKS_PER_GROUP(sb), - j); - if (k < EXT2_BLOCKS_PER_GROUP(sb)) { - j = k; + ret_block); + if (bit < EXT2_BLOCKS_PER_GROUP(sb)) { + ret_block = bit; goto got_block; } } @@ -465,33 +468,34 @@ repeat: * Now search the rest of the groups. We assume that * i and gdp correctly point to the last group visited. 
*/ - for (k = 0; k < sb->u.ext2_sb.s_groups_count; k++) { - i++; - if (i >= sb->u.ext2_sb.s_groups_count) - i = 0; - gdp = ext2_get_group_desc (sb, i, &bh2); + for (bit = 0; bit < sb->u.ext2_sb.s_groups_count; bit++) { + group_no++; + if (group_no >= sb->u.ext2_sb.s_groups_count) + group_no = 0; + gdp = ext2_get_group_desc (sb, group_no, &gdp_bh); if (!gdp) goto io_error; if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) break; } - if (k >= sb->u.ext2_sb.s_groups_count) + if (bit >= sb->u.ext2_sb.s_groups_count) goto out; - bitmap_nr = load_block_bitmap (sb, i); + bitmap_nr = load_block_bitmap (sb, group_no); if (bitmap_nr < 0) goto io_error; bh = sb->u.ext2_sb.s_block_bitmap[bitmap_nr]; r = memscan(bh->b_data, 0, EXT2_BLOCKS_PER_GROUP(sb) >> 3); - j = (r - bh->b_data) << 3; - if (j < EXT2_BLOCKS_PER_GROUP(sb)) + ret_block = (r - bh->b_data) << 3; + if (ret_block < EXT2_BLOCKS_PER_GROUP(sb)) goto search_back; else - j = ext2_find_first_zero_bit ((unsigned long *) bh->b_data, + ret_block = ext2_find_first_zero_bit ((unsigned long *) bh->b_data, EXT2_BLOCKS_PER_GROUP(sb)); - if (j >= EXT2_BLOCKS_PER_GROUP(sb)) { + if (ret_block >= EXT2_BLOCKS_PER_GROUP(sb)) { ext2_error (sb, "ext2_new_block", - "Free blocks count corrupted for block group %d", i); + "Free blocks count corrupted for block group %d", + group_no); goto out; } @@ -501,7 +505,7 @@ search_back: * bitmap. Now search backwards up to 7 bits to find the * start of this group of free blocks. 
*/ - for (k = 0; k < 7 && j > 0 && !ext2_test_bit (j - 1, bh->b_data); k++, j--); + for (bit = 0; bit < 7 && ret_block > 0 && !ext2_test_bit (ret_block - 1, bh->b_data); bit++, ret_block--); got_block: @@ -515,24 +519,24 @@ got_block: goto out; } - tmp = j + i * EXT2_BLOCKS_PER_GROUP(sb) + le32_to_cpu(es->s_first_data_block); + target_block = ret_block + group_no * EXT2_BLOCKS_PER_GROUP(sb) + le32_to_cpu(es->s_first_data_block); - if (tmp == le32_to_cpu(gdp->bg_block_bitmap) || - tmp == le32_to_cpu(gdp->bg_inode_bitmap) || - in_range (tmp, le32_to_cpu(gdp->bg_inode_table), + if (target_block == le32_to_cpu(gdp->bg_block_bitmap) || + target_block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range (target_block, le32_to_cpu(gdp->bg_inode_table), sb->u.ext2_sb.s_itb_per_group)) ext2_error (sb, "ext2_new_block", "Allocating block in system zone - " - "block = %u", tmp); + "block = %u", target_block); - if (ext2_set_bit (j, bh->b_data)) { + if (ext2_set_bit (ret_block, bh->b_data)) { ext2_warning (sb, "ext2_new_block", - "bit already set for block %d", j); + "bit already set for block %d", ret_block); DQUOT_FREE_BLOCK(inode, 1); goto repeat; } - ext2_debug ("found bit %d\n", j); + ext2_debug ("found bit %d\n", ret_block); /* * Do block preallocation now if required. @@ -541,21 +545,21 @@ got_block: /* Writer: ->i_prealloc* */ if (prealloc_count && !*prealloc_count) { int prealloc_goal; - unsigned long next_block = tmp + 1; + unsigned long next_block = target_block + 1; prealloc_goal = es->s_prealloc_blocks ? 
es->s_prealloc_blocks : EXT2_DEFAULT_PREALLOC_BLOCKS; *prealloc_block = next_block; /* Writer: end */ - for (k = 1; - k < prealloc_goal && (j + k) < EXT2_BLOCKS_PER_GROUP(sb); - k++, next_block++) { + for (bit = 1; + bit < prealloc_goal && (ret_block + bit) < EXT2_BLOCKS_PER_GROUP(sb); + bit++, next_block++) { if (DQUOT_PREALLOC_BLOCK(inode, 1)) break; /* Writer: ->i_prealloc* */ if (*prealloc_block + *prealloc_count != next_block || - ext2_set_bit (j + k, bh->b_data)) { + ext2_set_bit (ret_block + bit, bh->b_data)) { /* Writer: end */ DQUOT_FREE_BLOCK(inode, 1); break; @@ -569,16 +573,16 @@ got_block: */ gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - - (k - 1)); + (bit - 1)); es->s_free_blocks_count = cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - - (k - 1)); + (bit - 1)); ext2_debug ("Preallocated a further %lu bits.\n", - (k - 1)); + (bit - 1)); } #endif - j = tmp; + ret_block = target_block; mark_buffer_dirty(bh); if (sb->s_flags & MS_SYNCHRONOUS) { @@ -586,25 +590,25 @@ got_block: wait_on_buffer (bh); } - if (j >= le32_to_cpu(es->s_blocks_count)) { + if (ret_block >= le32_to_cpu(es->s_blocks_count)) { ext2_error (sb, "ext2_new_block", "block(%d) >= blocks count(%d) - " - "block_group = %d, es == %p ",j, - le32_to_cpu(es->s_blocks_count), i, es); + "block_group = %d, es == %p ",ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); goto out; } ext2_debug ("allocating block %d. 
" - "Goal hits %d of %d.\n", j, goal_hits, goal_attempts); + "Goal hits %d of %d.\n", ret_block, goal_hits, goal_attempts); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); - mark_buffer_dirty(bh2); + mark_buffer_dirty(gdp_bh); es->s_free_blocks_count = cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1); mark_buffer_dirty(sb->u.ext2_sb.s_sbh); sb->s_dirt = 1; unlock_super (sb); *err = 0; - return j; + return ret_block; io_error: *err = -EIO; --- linux-2.4-ext3merge/fs/ext2/ialloc.c.=K0015=.orig 2003-03-13 16:22:52.000000000 +0000 +++ linux-2.4-ext3merge/fs/ext2/ialloc.c 2003-03-13 16:23:13.000000000 +0000 @@ -406,24 +406,38 @@ static int find_group_other(struct super */ group = parent_group; desc = ext2_get_group_desc (sb, group, &bh); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) goto found; /* - * Use a quadratic hash to find a group with a - * free inode + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some + * free blocks. */ for (i = 1; i < ngroups; i <<= 1) { group += i; if (group >= ngroups) group -= ngroups; desc = ext2_get_group_desc (sb, group, &bh); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) goto found; } /* - * That failed: try linear search for a free inode + * That failed: try linear search for a free inode, even if that group + * has no free blocks. 
*/ group = parent_group + 1; for (i = 2; i < ngroups; i++) { --- linux-2.4-ext3merge/fs/ext2/inode.c.=K0015=.orig 2003-03-13 16:22:52.000000000 +0000 +++ linux-2.4-ext3merge/fs/ext2/inode.c 2003-03-13 16:23:13.000000000 +0000 @@ -282,13 +282,22 @@ no_block: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * * Caller must make sure that @ind is valid and will stay that way. */ -static inline unsigned long ext2_find_near(struct inode *inode, Indirect *ind) +static unsigned long ext2_find_near(struct inode *inode, Indirect *ind) { + struct ext2_inode_info *ei = &inode->u.ext2_i; u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext2_i.i_data; u32 *p; + unsigned long bg_start; + unsigned long colour; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) @@ -303,9 +312,11 @@ static inline unsigned long ext2_find_ne * It is going to be refered from inode itself? OK, just put it into * the same cylinder group then. 
*/ - return (inode->u.ext2_i.i_block_group * - EXT2_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block); + bg_start = (ei->i_block_group * EXT2_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT2_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; } /** --- linux-2.4-ext3merge/fs/ext3/ialloc.c.=K0015=.orig 2003-03-13 16:22:52.000000000 +0000 +++ linux-2.4-ext3merge/fs/ext3/ialloc.c 2003-03-13 16:23:13.000000000 +0000 @@ -451,24 +451,38 @@ static int find_group_other(struct super */ group = parent_group; desc = ext3_get_group_desc (sb, group, &bh); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) return group; /* - * Use a quadratic hash to find a group with a - * free inode + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. */ for (i = 1; i < ngroups; i <<= 1) { group += i; if (group >= ngroups) group -= ngroups; desc = ext3_get_group_desc (sb, group, &bh); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) return group; } /* - * That failed: try linear search for a free inode + * That failed: try linear search for a free inode, even if that group + * has no free blocks. 
*/ group = parent_group + 1; for (i = 2; i < ngroups; i++) { --- linux-2.4-ext3merge/fs/ext3/inode.c.=K0015=.orig 2003-03-13 16:22:52.000000000 +0000 +++ linux-2.4-ext3merge/fs/ext3/inode.c 2003-03-13 16:23:13.000000000 +0000 @@ -441,13 +441,22 @@ no_block: * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * * Caller must make sure that @ind is valid and will stay that way. */ -static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) +static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) { + struct ext3_inode_info *ei = EXT3_I(inode); u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data; u32 *p; + unsigned long bg_start; + unsigned long colour; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) @@ -462,9 +471,11 @@ static inline unsigned long ext3_find_ne * It is going to be refered from inode itself? OK, just put it into * the same cylinder group then. */ - return (inode->u.ext3_i.i_block_group * - EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block); + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; } /**