Skip to content

Commit

Permalink
Btrfs: process the delayed reference queue in clusters
Browse files Browse the repository at this point in the history
The delayed reference queue maintains pending operations that need to
be done to the extent allocation tree.  These are processed one at a
time, by finding records in the tree that are not currently being
processed.

This is slow because it spends a lot of time searching through the rbtree
and because it creates lock contention on the extent allocation tree
when many different processes are running delayed refs at the same time.

This commit changes things to grab a cluster of refs for processing,
using a cursor into the rbtree as the starting point of the next search.
This way we walk smoothly through the rbtree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
  • Loading branch information
Chris Mason committed Mar 24, 2009
1 parent 1887be6 commit c3e69d5
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 113 deletions.
1 change: 1 addition & 0 deletions fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,7 @@ struct btrfs_fs_info {
struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;

struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
Expand Down
130 changes: 96 additions & 34 deletions fs/btrfs/delayed-ref.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
* ref if it was able to find one, or NULL if nothing was in that spot
*/
static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
u64 bytenr, u64 parent)
u64 bytenr, u64 parent,
struct btrfs_delayed_ref_node **last)
{
struct rb_node *n = root->rb_node;
struct btrfs_delayed_ref_node *entry;
Expand All @@ -102,6 +103,8 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
WARN_ON(!entry->in_tree);
if (last)
*last = entry;

cmp = comp_entry(entry, bytenr, parent);
if (cmp < 0)
Expand All @@ -114,45 +117,99 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
return NULL;
}

/*
* Locking on delayed refs is done by taking a lock on the head node,
* which has the (impossible) parent id of (u64)-1. Once a lock is held
* on the head node, you're allowed (and required) to process all the
* delayed refs for a given byte number in the tree.
*
* This will walk forward in the rbtree until it finds a head node it
* is able to lock. It might not lock the delayed ref you asked for,
* and so it will return the one it did lock in next_ret and return 0.
*
* If no locks are taken, next_ret is set to null and 1 is returned. This
* means there are no more unlocked head nodes in the rbtree.
*/
int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
struct btrfs_delayed_ref_head **next_ret)
/*
 * Take head->mutex on behalf of a caller that already holds
 * delayed_refs->lock (checked with assert_spin_locked() below).
 *
 * Fast path: mutex_trylock() succeeds and 0 is returned without ever
 * dropping the spinlock.
 *
 * Slow path: head->node.refs is incremented, the spinlock is released so
 * that mutex_lock() can block, and the spinlock is retaken once the mutex
 * is held.  If the head is no longer in the tree at that point
 * (!head->node.in_tree), the mutex is released again and -EAGAIN is
 * returned; otherwise 0 is returned with the mutex held.  On both slow-path
 * exits btrfs_put_delayed_ref() is called on the node and the spinlock is
 * held on return.
 *
 * Returns 0 with head->mutex held, or -EAGAIN with it not held.
 */
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_root *delayed_refs;

delayed_refs = &trans->transaction->delayed_refs;
assert_spin_locked(&delayed_refs->lock);
/* uncontended case: no need to drop the spinlock at all */
if (mutex_trylock(&head->mutex))
return 0;

/*
 * NOTE(review): the extra reference presumably keeps the head from being
 * freed while the spinlock is dropped -- confirm against
 * btrfs_put_delayed_ref().
 */
atomic_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);

mutex_lock(&head->mutex);
spin_lock(&delayed_refs->lock);
/* in_tree is re-read only after the spinlock has been retaken */
if (!head->node.in_tree) {
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
return -EAGAIN;
}
btrfs_put_delayed_ref(&head->node);
return 0;
}

/*
 * Gather a cluster of delayed ref head nodes onto the caller's list.
 *
 * trans:   transaction handle; the rbtree walked is
 *          trans->transaction->delayed_refs.root
 * cluster: list that selected heads are appended to (via head->cluster)
 * start:   byte number to begin the walk at; 0 means start from rb_first()
 *
 * For a non-zero start, tree_search() is asked for (start, (u64)-1) and
 * reports the last node it visited; the walk then backs up with rb_prev()
 * for as long as the previous node's bytenr is not below start.  If the
 * search reports no node at all, the walk begins at rb_first().
 *
 * The scan loop runs while there is a node and fewer than 32 heads have
 * been collected.  A head whose ->cluster list is empty is added to
 * 'cluster', recorded in delayed_refs->run_delayed_start, and accounted
 * for by decrementing delayed_refs->num_heads_ready.
 *
 * Returns 0 if at least one head was collected.  If none were collected
 * and start was non-zero, start is reset to 0 and the scan is retried from
 * rb_first().  Returns 1 if nothing was collected.
 *
 * NOTE(review): as rendered here this span contains statements that do not
 * match this function's signature (the 'ret' local and 'return ret', the
 * '*next_ret' assignments, the mutex_trylock() branch, the 'while (1)'
 * opener and a second rb_next() step).  They look like leftovers of the
 * previous btrfs_lock_delayed_ref() body shown by the diff view -- confirm
 * against the applied file before relying on this text.
 */
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 start)
{
int count = 0;
struct btrfs_delayed_ref_root *delayed_refs;
struct rb_node *node;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *head;
int ret = 0;

while (1) {
delayed_refs = &trans->transaction->delayed_refs;
if (start == 0) {
node = rb_first(&delayed_refs->root);
} else {
ref = NULL;
tree_search(&delayed_refs->root, start, (u64)-1, &ref);
if (ref) {
struct btrfs_delayed_ref_node *tmp;

/* step backwards while the previous entry is still at or past start */
node = rb_prev(&ref->rb_node);
while (node) {
tmp = rb_entry(node,
struct btrfs_delayed_ref_node,
rb_node);
if (tmp->bytenr < start)
break;
ref = tmp;
node = rb_prev(&ref->rb_node);
}
node = &ref->rb_node;
} else
node = rb_first(&delayed_refs->root);
}
again:
while (node && count < 32) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
if (btrfs_delayed_ref_is_head(ref)) {
head = btrfs_delayed_node_to_head(ref);
if (mutex_trylock(&head->mutex)) {
*next_ret = head;
ret = 0;
/* an empty ->cluster list means this head is not on any cluster yet */
if (list_empty(&head->cluster)) {
list_add_tail(&head->cluster, cluster);
delayed_refs->run_delayed_start =
head->node.bytenr;
count++;

WARN_ON(delayed_refs->num_heads_ready == 0);
delayed_refs->num_heads_ready--;
} else if (count) {
/* the goal of the clustering is to find extents
* that are likely to end up in the same extent
* leaf on disk. So, we don't want them spread
* all over the tree. Stop now if we've hit
* a head that was already in use
*/
break;
}
}
node = rb_next(&ref->rb_node);
if (!node) {
ret = 1;
*next_ret = NULL;
break;
}
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
node = rb_next(node);
}
return ret;
if (count) {
return 0;
} else if (start) {
/*
* we've gone to the end of the rbtree without finding any
* clusters. start from the beginning and try again
*/
start = 0;
node = rb_first(&delayed_refs->root);
goto again;
}
return 1;
}

/*
Expand All @@ -178,7 +235,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);

ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
if (ref) {
prev_node = rb_prev(&ref->rb_node);
if (!prev_node)
Expand Down Expand Up @@ -240,7 +297,7 @@ int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
}

spin_lock(&delayed_refs->lock);
ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
if (ref) {
head = btrfs_delayed_node_to_head(ref);
if (mutex_trylock(&head->mutex)) {
Expand Down Expand Up @@ -384,7 +441,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_ref *full_ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
int count_mod = 1;
int must_insert_reserved = 0;
Expand Down Expand Up @@ -428,6 +485,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
if (btrfs_delayed_ref_is_head(ref)) {
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
INIT_LIST_HEAD(&head_ref->cluster);
mutex_init(&head_ref->mutex);
} else {
full_ref = btrfs_delayed_node_to_ref(ref);
Expand All @@ -453,6 +511,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
*/
kfree(ref);
} else {
if (btrfs_delayed_ref_is_head(ref)) {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
}
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
Expand Down Expand Up @@ -522,7 +584,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
struct btrfs_delayed_ref_root *delayed_refs;

delayed_refs = &trans->transaction->delayed_refs;
ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
if (ref)
return btrfs_delayed_node_to_head(ref);
return NULL;
Expand Down
17 changes: 14 additions & 3 deletions fs/btrfs/delayed-ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ struct btrfs_delayed_ref_head {
*/
struct mutex mutex;

struct list_head cluster;

/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
Expand Down Expand Up @@ -115,12 +117,20 @@ struct btrfs_delayed_ref_root {
*/
unsigned long num_entries;

/* total number of head nodes in tree */
unsigned long num_heads;

/* total number of head nodes ready for processing */
unsigned long num_heads_ready;

/*
* set when the tree is flushing before a transaction commit,
* used by the throttling code to decide if new updates need
* to be run right away
*/
int flushing;

u64 run_delayed_start;
};

static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
Expand All @@ -140,9 +150,6 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
struct btrfs_delayed_ref_head **next_ret);
int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u32 *refs);
Expand All @@ -151,6 +158,10 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
u64 parent, u64 orig_ref_root, u64 ref_root,
u64 orig_ref_generation, u64 ref_generation,
u64 owner_objectid, int pin);
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
Expand Down
17 changes: 0 additions & 17 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -1458,7 +1458,6 @@ static int transaction_kthread(void *arg)
struct btrfs_root *root = arg;
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
struct btrfs_fs_info *info = root->fs_info;
unsigned long now;
unsigned long delay;
int ret;
Expand All @@ -1481,24 +1480,8 @@ static int transaction_kthread(void *arg)

now = get_seconds();
if (now < cur->start_time || now - cur->start_time < 30) {
unsigned long num_delayed;
num_delayed = cur->delayed_refs.num_entries;
mutex_unlock(&root->fs_info->trans_mutex);
delay = HZ * 5;

/*
* we may have been woken up early to start
* processing the delayed extent ref updates
* If so, run some of them and then loop around again
* to see if we need to force a commit
*/
if (num_delayed > 64) {
mutex_unlock(&info->transaction_kthread_mutex);
trans = btrfs_start_transaction(root, 1);
btrfs_run_delayed_refs(trans, root, 256);
btrfs_end_transaction(trans, root);
continue;
}
goto sleep;
}
mutex_unlock(&root->fs_info->trans_mutex);
Expand Down
Loading

0 comments on commit c3e69d5

Please sign in to comment.