Skip to content

Commit

Permalink
ext4: Don't allow lg prealloc list to be grow large.
Browse files Browse the repository at this point in the history
Currently, the locality group prealloc list is freed only when there
is a block allocation failure. This can result in large number of
entries in the preallocation list making ext4_mb_use_preallocated()
expensive.

To fix this, we convert the locality group prealloc list to a hash
list. The hash index is the order of number of blocks in the prealloc
space with a max order of 9. When adding prealloc space to the list we
make sure total entries for each order does not exceed 8. If it is
more than 8 we discard few entries and make sure the we have only <= 5
entries.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
  • Loading branch information
Aneesh Kumar K.V authored and Theodore Ts'o committed Jul 23, 2008
1 parent 1320cbc commit 6be2ded
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 30 deletions.
213 changes: 185 additions & 28 deletions fs/ext4/mballoc.c
Original file line number Diff line number Diff line change
Expand Up @@ -2480,7 +2480,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
int ext4_mb_init(struct super_block *sb, int needs_recovery)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i;
unsigned i, j;
unsigned offset;
unsigned max;
int ret;
Expand Down Expand Up @@ -2552,7 +2552,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
struct ext4_locality_group *lg;
lg = &sbi->s_locality_groups[i];
mutex_init(&lg->lg_mutex);
INIT_LIST_HEAD(&lg->lg_prealloc_list);
for (j = 0; j < PREALLOC_TB_SIZE; j++)
INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
spin_lock_init(&lg->lg_prealloc_lock);
}

Expand Down Expand Up @@ -3263,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
unsigned int len = ac->ac_o_ex.fe_len;

ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
&ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
Expand All @@ -3285,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
struct ext4_prealloc_space *pa;
Expand Down Expand Up @@ -3325,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
lg = ac->ac_lg;
if (lg == NULL)
return 0;

rcu_read_lock();
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
atomic_inc(&pa->pa_count);
ext4_mb_use_group_pa(ac, pa);
order = fls(ac->ac_o_ex.fe_len) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;

for (i = order; i < PREALLOC_TB_SIZE; i++) {
rcu_read_lock();
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
pa_inode_list) {
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 &&
pa->pa_free >= ac->ac_o_ex.fe_len) {
atomic_inc(&pa->pa_count);
ext4_mb_use_group_pa(ac, pa);
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 20;
rcu_read_unlock();
return 1;
}
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 20;
rcu_read_unlock();
return 1;
}
spin_unlock(&pa->pa_lock);
rcu_read_unlock();
}
rcu_read_unlock();

return 0;
}

Expand Down Expand Up @@ -3563,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
pa->pa_deleted = 0;
pa->pa_linear = 1;

Expand All @@ -3583,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

spin_lock(pa->pa_obj_lock);
list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
spin_unlock(pa->pa_obj_lock);

/*
* We will later add the new pa to the right bucket
* after updating the pa_free in ext4_mb_release_context
*/
return 0;
}

Expand Down Expand Up @@ -4123,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,

}

static noinline_for_stack void
ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct ext4_locality_group *lg,
int order, int total_entries)
{
ext4_group_t group = 0;
struct ext4_buddy e4b;
struct list_head discard_list;
struct ext4_prealloc_space *pa, *tmp;
struct ext4_allocation_context *ac;

mb_debug("discard locality group preallocation\n");

INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);

spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/*
* This is the pa that we just used
* for block allocation. So don't
* free that
*/
spin_unlock(&pa->pa_lock);
continue;
}
if (pa->pa_deleted) {
spin_unlock(&pa->pa_lock);
continue;
}
/* only lg prealloc space */
BUG_ON(!pa->pa_linear);

/* seems this one can be freed ... */
pa->pa_deleted = 1;
spin_unlock(&pa->pa_lock);

list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &discard_list);

total_entries--;
if (total_entries <= 5) {
/*
* we want to keep only 5 entries
* allowing it to grow to 8. This
* mak sure we don't call discard
* soon for this list.
*/
break;
}
}
spin_unlock(&lg->lg_prealloc_lock);

list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {

ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, __func__, "Error in loading buddy "
"information for %lu\n", group);
continue;
}
ext4_lock_group(sb, group);
list_del(&pa->pa_group_list);
ext4_mb_release_group_pa(&e4b, pa, ac);
ext4_unlock_group(sb, group);

ext4_mb_release_desc(&e4b);
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
}

/*
* We have incremented pa_count. So it cannot be freed at this
* point. Also we hold lg_mutex. So no parallel allocation is
* possible from this lg. That means pa_free cannot be updated.
*
* A parallel ext4_mb_discard_group_preallocations is possible.
* which can cause the lg_prealloc_list to be updated.
*/

static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
int order, added = 0, lg_prealloc_count = 1;
struct super_block *sb = ac->ac_sb;
struct ext4_locality_group *lg = ac->ac_lg;
struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;

order = fls(pa->pa_free) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
rcu_read_lock();
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&pa->pa_lock);
continue;
}
if (!added && pa->pa_free < tmp_pa->pa_free) {
/* Add to the tail of the previous entry */
list_add_tail_rcu(&pa->pa_inode_list,
&tmp_pa->pa_inode_list);
added = 1;
/*
* we want to count the total
* number of entries in the list
*/
}
spin_unlock(&tmp_pa->pa_lock);
lg_prealloc_count++;
}
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
rcu_read_unlock();

/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
order, lg_prealloc_count);
return;
}
return ;
}

/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
if (ac->ac_pa) {
if (ac->ac_pa->pa_linear) {
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
if (pa->pa_linear) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&ac->ac_pa->pa_lock);
ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&ac->ac_pa->pa_lock);
spin_lock(&pa->pa_lock);
pa->pa_pstart += ac->ac_b_ex.fe_len;
pa->pa_lstart += ac->ac_b_ex.fe_len;
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
/*
* We want to add the pa to the right bucket.
* Remove it from the list and while adding
* make sure the list to which we are adding
* doesn't grow big.
*/
if (likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
ext4_mb_add_n_trim(ac);
}
}
ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
page_cache_release(ac->ac_bitmap_page);
Expand Down
10 changes: 8 additions & 2 deletions fs/ext4/mballoc.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,17 @@ struct ext4_free_extent {
* Locality group:
* we try to group all related changes together
* so that writeback can flush/allocate them together as well
* Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
* (512). We store prealloc space into the hash based on the pa_free blocks
* order value.ie, fls(pa_free)-1;
*/
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
/* for allocator */
struct mutex lg_mutex; /* to serialize allocates */
struct list_head lg_prealloc_list;/* list of preallocations */
/* to serialize allocates */
struct mutex lg_mutex;
/* list of preallocations */
struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
spinlock_t lg_prealloc_lock;
};

Expand Down

0 comments on commit 6be2ded

Please sign in to comment.