Skip to content

Commit

Permalink
ext4: New inode/block allocation algorithms for flex_bg filesystems
Browse files Browse the repository at this point in the history
The find_group_flex() inode allocator is now only used if the
filesystem is mounted using the "oldalloc" mount option.  It is
replaced with the original Orlov allocator that has been updated for
flex_bg filesystems (it should behave the same way if flex_bg is
disabled).  The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.

The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks.  This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:

debugfs:  stat /home/tytso/Maildir/cur
Inode: 1844562   Type: directory    Mode:  0700   Flags: 0x81000
Generation: 1132745781    Version: 0x00000000:0000ad71
User: 15806   Group: 15806   Size: 1060864
File ACL: 0    Directory ACL: 0
Links: 2   Blockcount: 2072
Fragment:  Address: 0    Number: 0    Size: 0
 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
  • Loading branch information
Theodore Ts'o committed Mar 12, 2009
1 parent 2dc6b0d commit a491212
Show file tree
Hide file tree
Showing 6 changed files with 216 additions and 58 deletions.
6 changes: 6 additions & 0 deletions fs/ext4/ext4.h
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */

/*
* Minimum number of groups in a flexgroup before we separate out
* directories into the first block group of a flexgroup
*/
#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4

/*
* Structure of a directory entry
*/
Expand Down
3 changes: 3 additions & 0 deletions fs/ext4/ext4_i.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;

/* ialloc */
ext4_group_t i_last_alloc_group;

/* allocation reservation info for delalloc */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
Expand Down
25 changes: 24 additions & 1 deletion fs/ext4/extents.c
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
ext4_group_t block_group;
int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
int depth;

if (path) {
Expand All @@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}

/* OK. use inode's group */
bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
block_group = ei->i_block_group;
if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
/*
* If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
* block groups per flexgroup, reserve the first block
* group for directories and special files. Regular
* files will start at the second block group. This
* tends to speed up directory access and improves
* fsck times.
*/
block_group &= ~(flex_size-1);
if (S_ISREG(inode->i_mode))
block_group++;
}
bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

/*
* If we are doing delayed allocation, we don't need take
* colour into account.
*/
if (test_opt(inode->i_sb, DELALLOC))
return bg_start;

if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
Expand Down
215 changes: 159 additions & 56 deletions fs/ext4/ialloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,43 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
return 0;
}

struct orlov_stats {
__u32 free_inodes;
__u32 free_blocks;
__u32 used_dirs;
};

/*
* Helper function for Orlov's allocator; returns critical information
* for a particular block group or flex_bg. If flex_size is 1, then g
* is a block group number; otherwise it is flex_bg number.
*/
void get_orlov_stats(struct super_block *sb, ext4_group_t g,
int flex_size, struct orlov_stats *stats)
{
struct ext4_group_desc *desc;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
int i;

stats->free_inodes = 0;
stats->free_blocks = 0;
stats->used_dirs = 0;

g *= flex_size;

for (i = 0; i < flex_size; i++) {
if (g >= ngroups)
break;
desc = ext4_get_group_desc(sb, g++, NULL);
if (!desc)
continue;

stats->free_inodes += ext4_free_inodes_count(sb, desc);
stats->free_blocks += ext4_free_blks_count(sb, desc);
stats->used_dirs += ext4_used_dirs_count(sb, desc);
}
}

/*
* Orlov's allocator for directories.
*
Expand All @@ -425,35 +462,34 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
* it's already running too large debt (max_debt).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
*
* Debt is incremented each time we allocate a directory and decremented
* when we allocate an inode, within 0--255.
*/

#define INODE_COST 64
#define BLOCK_COST 256

static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4_group_t *group)
ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_group_t ngroups = sbi->s_groups_count;
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
ext4_fsblk_t blocks_per_dir;
unsigned int ndirs;
int max_debt, max_dirs, min_inodes;
int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
ext4_group_t i;
ext4_group_t i, grp, g;
struct ext4_group_desc *desc;
struct orlov_stats stats;
int flex_size = ext4_flex_bg_size(sbi);

if (flex_size > 1) {
ngroups = (ngroups + flex_size - 1) >>
sbi->s_log_groups_per_flex;
parent_group >>= sbi->s_log_groups_per_flex;
}

freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
Expand All @@ -462,71 +498,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
do_div(avefreeb, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

if ((parent == sb->s_root->d_inode) ||
(EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
if (S_ISDIR(mode) &&
((parent == sb->s_root->d_inode) ||
(EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
int best_ndir = inodes_per_group;
ext4_group_t grp;
int ret = -1;

get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
if (!desc || !ext4_free_inodes_count(sb, desc))
g = (parent_group + i) % ngroups;
get_orlov_stats(sb, g, flex_size, &stats);
if (!stats.free_inodes)
continue;
if (ext4_used_dirs_count(sb, desc) >= best_ndir)
if (stats.used_dirs >= best_ndir)
continue;
if (ext4_free_inodes_count(sb, desc) < avefreei)
if (stats.free_inodes < avefreei)
continue;
if (ext4_free_blks_count(sb, desc) < avefreeb)
if (stats.free_blocks < avefreeb)
continue;
*group = grp;
grp = g;
ret = 0;
best_ndir = ext4_used_dirs_count(sb, desc);
best_ndir = stats.used_dirs;
}
if (ret)
goto fallback;
found_flex_bg:
if (flex_size == 1) {
*group = grp;
return 0;
}

/*
* We pack inodes at the beginning of the flexgroup's
* inode tables. Block allocation decisions will do
* something similar, although regular files will
* start at 2nd block group of the flexgroup. See
* ext4_ext_find_goal() and ext4_find_near().
*/
grp *= flex_size;
for (i = 0; i < flex_size; i++) {
if (grp+i >= sbi->s_groups_count)
break;
desc = ext4_get_group_desc(sb, grp+i, NULL);
if (desc && ext4_free_inodes_count(sb, desc)) {
*group = grp+i;
return 0;
}
}
if (ret == 0)
return ret;
goto fallback;
}

blocks_per_dir = ext4_blocks_count(es) - freeb;
do_div(blocks_per_dir, ndirs);

max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;

max_debt = EXT4_BLOCKS_PER_GROUP(sb);
max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
if (max_debt * INODE_COST > inodes_per_group)
max_debt = inodes_per_group / INODE_COST;
if (max_debt > 255)
max_debt = 255;
if (max_debt == 0)
max_debt = 1;
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;

/*
* Start looking in the flex group where we last allocated an
* inode for this parent directory
*/
if (EXT4_I(parent)->i_last_alloc_group != ~0) {
parent_group = EXT4_I(parent)->i_last_alloc_group;
if (flex_size > 1)
parent_group >>= sbi->s_log_groups_per_flex;
}

for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
if (ext4_used_dirs_count(sb, desc) >= max_dirs)
grp = (parent_group + i) % ngroups;
get_orlov_stats(sb, grp, flex_size, &stats);
if (stats.used_dirs >= max_dirs)
continue;
if (ext4_free_inodes_count(sb, desc) < min_inodes)
if (stats.free_inodes < min_inodes)
continue;
if (ext4_free_blks_count(sb, desc) < min_blocks)
if (stats.free_blocks < min_blocks)
continue;
return 0;
goto found_flex_bg;
}

fallback:
ngroups = sbi->s_groups_count;
avefreei = freei / ngroups;
parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
ext4_free_inodes_count(sb, desc) >= avefreei)
ext4_free_inodes_count(sb, desc) >= avefreei) {
*group = grp;
return 0;
}
}

if (avefreei) {
Expand All @@ -542,12 +604,51 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
}

static int find_group_other(struct super_block *sb, struct inode *parent,
ext4_group_t *group)
ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
struct ext4_group_desc *desc;
ext4_group_t i;
ext4_group_t i, last;
int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

/*
* Try to place the inode is the same flex group as its
* parent. If we can't find space, use the Orlov algorithm to
* find another flex group, and store that information in the
* parent directory's inode information so that use that flex
* group for future allocations.
*/
if (flex_size > 1) {
int retry = 0;

try_again:
parent_group &= ~(flex_size-1);
last = parent_group + flex_size;
if (last > ngroups)
last = ngroups;
for (i = parent_group; i < last; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc && ext4_free_inodes_count(sb, desc)) {
*group = i;
return 0;
}
}
if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
retry = 1;
parent_group = EXT4_I(parent)->i_last_alloc_group;
goto try_again;
}
/*
* If this didn't work, use the Orlov search algorithm
* to find a new flex group; we pass in the mode to
* avoid the topdir algorithms.
*/
*group = parent_group + flex_size;
if (*group > ngroups)
*group = 0;
return find_group_orlov(sb, parent, group, mode);
}

/*
* Try to place the inode in its parent directory
Expand Down Expand Up @@ -716,10 +817,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;

if (sbi->s_log_groups_per_flex) {
if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
ret2 = find_group_other(sb, dir, &group);
ret2 = find_group_other(sb, dir, &group, mode);
if (ret2 == 0 && once)
once = 0;
printk(KERN_NOTICE "ext4: find_group_flex "
Expand All @@ -733,11 +834,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
ret2 = find_group_orlov(sb, dir, &group);
ret2 = find_group_orlov(sb, dir, &group, mode);
} else
ret2 = find_group_other(sb, dir, &group);
ret2 = find_group_other(sb, dir, &group, mode);

got_group:
EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;
Expand Down Expand Up @@ -894,6 +996,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;

ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
Expand Down
Loading

0 comments on commit a491212

Please sign in to comment.