From 60b2788f0f44379558c111e6e2e22ef3e40d2141 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:31 -0400 Subject: [PATCH] --- yaml --- r: 297857 b: refs/heads/master c: 8c3429300181be44b30f9f017d53dc717da56caa h: refs/heads/master i: 297855: 8d5c187d4b6eb3afdfb99e7d82cb48ffbeee408e v: v3 --- [refs] | 2 +- trunk/fs/btrfs/ctree.c | 134 +--- trunk/fs/btrfs/ctree.h | 53 +- trunk/fs/btrfs/disk-io.c | 376 +++++---- trunk/fs/btrfs/extent-tree.c | 114 ++- trunk/fs/btrfs/extent_io.c | 803 +++++-------------- trunk/fs/btrfs/extent_io.h | 49 +- trunk/fs/btrfs/inode-item.c | 1 - trunk/fs/btrfs/inode.c | 11 +- trunk/fs/btrfs/reada.c | 10 +- trunk/fs/btrfs/scrub.c | 1380 +++++++++------------------------ trunk/fs/btrfs/struct-funcs.c | 53 +- trunk/fs/btrfs/super.c | 50 +- trunk/fs/btrfs/volumes.c | 2 +- 14 files changed, 1022 insertions(+), 2016 deletions(-) diff --git a/[refs] b/[refs] index 25bf382e522a..532dfacc26cb 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: b5d67f64f9bc656970dacba245410f0faedad18e +refs/heads/master: 8c3429300181be44b30f9f017d53dc717da56caa diff --git a/trunk/fs/btrfs/ctree.c b/trunk/fs/btrfs/ctree.c index 270655da11d1..0639a555e16e 100644 --- a/trunk/fs/btrfs/ctree.c +++ b/trunk/fs/btrfs/ctree.c @@ -156,23 +156,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - while (1) { - rcu_read_lock(); - eb = rcu_dereference(root->node); - - /* - * RCU really hurts here, we could free up the root node because - * it was cow'ed but we may not get the new root node yet so do - * the inc_not_zero dance and if it doesn't work then - * synchronize_rcu and try again. - */ - if (atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - break; - } - rcu_read_unlock(); - synchronize_rcu(); - } + rcu_read_lock(); + eb = rcu_dereference(root->node); + extent_buffer_get(eb); + rcu_read_unlock(); return eb; } @@ -517,7 +504,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer_stale(buf); + free_extent_buffer(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; @@ -972,7 +959,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ - free_extent_buffer_stale(mid); + free_extent_buffer(mid); return 0; } if (btrfs_header_nritems(mid) > @@ -1029,7 +1016,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1, 0); - free_extent_buffer_stale(right); + free_extent_buffer(right); right = NULL; } else { struct btrfs_disk_key right_key; @@ -1069,7 +1056,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - free_extent_buffer_stale(mid); + free_extent_buffer(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ @@ -1395,8 +1382,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock, int min_write_lock_level, - int *write_lock_level) + int lowest_unlock) { int i; int skip_level = level; @@ -1428,11 +1414,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level, if (i >= lowest_unlock && i > 
skip_level && path->locks[i]) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; - if (write_lock_level && - i > min_write_lock_level && - i <= *write_lock_level) { - *write_lock_level = i - 1; - } } } } @@ -1656,7 +1637,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; - int min_write_lock_level; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1684,8 +1664,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; - min_write_lock_level = write_lock_level; - again: /* * we try very hard to do read locks on the root @@ -1817,8 +1795,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root goto again; } - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); + unlock_up(p, level, lowest_unlock); if (level == lowest_level) { if (dec) @@ -1880,8 +1857,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root } } if (!p->search_for_split) - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); + unlock_up(p, level, lowest_unlock); goto done; } } @@ -2344,7 +2320,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; - struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -2356,8 +2331,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 data_end; u32 this_item_size; - btrfs_init_map_token(&token); - if (empty) nr = 0; else @@ -2435,8 +2408,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space -= btrfs_token_item_size(right, item, &token); - btrfs_set_token_item_offset(right, item, push_space, &token); + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); } left_nritems -= push_items; @@ -2567,9 +2540,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, int wret; u32 this_item_size; u32 old_left_item_size; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); if (empty) nr = min(right_nritems, max_slot); @@ -2630,10 +2600,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, item = btrfs_item_nr(left, i); - ioff = btrfs_token_item_offset(left, item, &token); - btrfs_set_token_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), - &token); + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2663,9 +2632,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space = push_space - btrfs_token_item_size(right, - item, &token); - btrfs_set_token_item_offset(right, item, push_space, &token); + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); } btrfs_mark_buffer_dirty(left); @@ -2786,9 +2754,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, int ret = 0; int wret; 
struct btrfs_disk_key disk_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -2810,9 +2775,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - ioff = btrfs_token_item_offset(right, item, &token); - btrfs_set_token_item_offset(right, item, - ioff + rt_data_off, &token); + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); @@ -3307,9 +3271,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3336,9 +3297,8 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + size_diff, &token); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); } /* shift the data */ @@ -3408,9 +3368,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -3440,9 +3397,8 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - data_size, &token); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); } /* shift the data */ @@ -3485,9 +3441,6 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, unsigned int data_end; struct btrfs_disk_key disk_key; struct btrfs_key found_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); for (i = 0; i < nr; i++) { if (total_size + data_size[i] + sizeof(struct btrfs_item) > @@ -3553,9 +3506,8 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3582,10 +3534,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); + btrfs_set_item_size(leaf, item, data_size[i]); } btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); @@ -3624,9 +3575,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, int ret; struct extent_buffer *leaf; int slot; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3658,9 +3606,8 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, 
item, - ioff - total_data, &token); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3679,10 +3626,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); + btrfs_set_item_size(leaf, item, data_size[i]); } btrfs_set_header_nritems(leaf, nritems + nr); @@ -3835,9 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - extent_buffer_get(leaf); btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); - free_extent_buffer_stale(leaf); return 0; } /* @@ -3855,9 +3799,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -3879,9 +3820,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + dsize, &token); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -4119,7 +4059,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1, 0, NULL); + unlock_up(path, level, 1); goto out; } btrfs_set_path_blocking(path); @@ -4130,7 +4070,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; - unlock_up(path, level, 1, 0, NULL); + unlock_up(path, level, 1); btrfs_clear_path_blocking(path, NULL, 0); } out: @@ -4366,7 +4306,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) } ret = 0; done: - unlock_up(path, 0, 1, 0, NULL); + unlock_up(path, 0, 1); path->leave_spinning = old_spinning; if (!old_spinning) btrfs_set_path_blocking(path); diff --git a/trunk/fs/btrfs/ctree.h b/trunk/fs/btrfs/ctree.h index f7da8a8d13c1..a97a67089755 100644 --- a/trunk/fs/btrfs/ctree.h +++ b/trunk/fs/btrfs/ctree.h @@ -48,8 +48,6 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" -#define BTRFS_MAX_MIRRORS 2 - #define BTRFS_MAX_LEVEL 8 #define BTRFS_COMPAT_EXTENT_TREE_V0 @@ -139,12 +137,6 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 -/* - * the max metadata block size. This limit is somewhat artificial, - * but the memmove costs go through the roof for larger blocks. 
- */ -#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 - /* * we can actually store much bigger names, but lets not confuse the rest * of linux @@ -469,19 +461,6 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) - -/* - * older kernels tried to do bigger metadata blocks, but the - * code was pretty buggy. Lets not let them try anymore. - */ -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -489,7 +468,6 @@ struct btrfs_super_block { (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* @@ -1525,6 +1503,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) +#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1548,17 +1527,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - -static inline void btrfs_init_map_token (struct btrfs_map_token *token) -{ - memset(token, 0, sizeof(*token)); -} - /* some macros to generate set/get funcs for the struct fields. 
This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1582,22 +1550,20 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) #ifndef BTRFS_SETGET_FUNCS #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #endif #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = page_address(eb->pages[0]); \ + type *p = page_address(eb->first_page); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->pages[0]); \ + type *p = page_address(eb->first_page); \ p->member = cpu_to_le##bits(val); \ } @@ -2501,7 +2467,8 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data); + u64 search_end, struct btrfs_key *ins, + u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3004,6 +2971,16 @@ do { \ __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ } while (0) +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +#define btrfs_panic(fs_info, errno, fmt, args...) 
\ +do { \ + struct btrfs_fs_info *_i = (fs_info); \ + __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ + BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ +} while (0) + /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); diff --git a/trunk/fs/btrfs/disk-io.c b/trunk/fs/btrfs/disk-io.c index 6107b6958413..534266fe505f 100644 --- a/trunk/fs/btrfs/disk-io.c +++ b/trunk/fs/btrfs/disk-io.c @@ -333,7 +333,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state, GFP_NOFS); - if (extent_buffer_uptodate(eb) && + if (extent_buffer_uptodate(io_tree, eb, cached_state) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; @@ -344,7 +344,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, (unsigned long long)parent_transid, (unsigned long long)btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(eb); + clear_extent_buffer_uptodate(io_tree, eb, &cached_state); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); @@ -360,11 +360,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, u64 start, u64 parent_transid) { struct extent_io_tree *io_tree; - int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; - int failed_mirror = 0; clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; @@ -372,8 +370,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) - break; + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; /* * This buffer's crc is fine, but its contents are corrupted, so @@ -381,31 +380,18 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * any less wrong. 
*/ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) - break; - - if (!failed_mirror) { - failed = 1; - printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror); - failed_mirror = eb->failed_mirror; - } + return ret; num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) - break; + return ret; mirror_num++; - if (mirror_num == failed_mirror) - mirror_num++; - if (mirror_num > num_copies) - break; + return ret; } - - if (failed && !ret) - repair_eb_io_failure(root, eb, failed_mirror); - - return ret; + return -EIO; } /* @@ -418,28 +404,50 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; + unsigned long len; struct extent_buffer *eb; + int ret; tree = &BTRFS_I(page->mapping->host)->io_tree; - eb = (struct extent_buffer *)page->private; - if (page != eb->pages[0]) - return 0; + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); + goto out; + } + if (!page->private) { + WARN_ON(1); + goto out; + } + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) { + WARN_ON(1); + goto out; + } + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); - return 0; + goto err; } - if (eb->pages[0] != page) { + if (eb->first_page != page) { WARN_ON(1); - return 0; + goto err; } if (!PageUptodate(page)) { WARN_ON(1); - return 0; + goto err; } csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: return 0; } @@ -529,74 +537,34 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, - struct page *page, int max_walk) -{ - struct extent_buffer *eb; - u64 start = page_offset(page); - u64 target = start; - u64 min_start; - - if (start < max_walk) - min_start = 0; - else - min_start = start - max_walk; - - while (start >= min_start) { - eb = find_extent_buffer(tree, start, 0); - if (eb) { - /* - * we found an extent buffer and it contains our page - * horray! - */ - if (eb->start <= target && - eb->start + eb->len > target) - return eb; - - /* we found an extent buffer that wasn't for us */ - free_extent_buffer(eb); - return NULL; - } - if (start == 0) - break; - start -= PAGE_CACHE_SIZE; - } - return NULL; -} - static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { struct extent_io_tree *tree; u64 found_start; int found_level; + unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; int ret = 0; - int reads_done; + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; if (!page->private) goto out; - tree = &BTRFS_I(page->mapping->host)->io_tree; - eb = (struct extent_buffer *)page->private; + len = page->private >> 2; + WARN_ON(len == 0); - /* the pending IO might have been the only thing that kept this buffer - * in memory. 
Make sure we have a ref for all this other checks - */ - extent_buffer_get(eb); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) { ret = -EIO; - goto err; + goto out; } found_start = btrfs_header_bytenr(eb); - if (found_start != eb->start) { + if (found_start != start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", (unsigned long long)found_start, @@ -604,6 +572,13 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } if (check_tree_block_fsid(root, eb)) { printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", (unsigned long long)eb->start); @@ -631,31 +606,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; } - if (!ret) - set_extent_buffer_uptodate(eb); + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; err: if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, ret); } - if (ret) - clear_extent_buffer_uptodate(eb); free_extent_buffer(eb); out: return ret; } -static int btree_io_failed_hook(struct page *page, int failed_mirror) +static int btree_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int mirror_num, struct extent_state *state) { + struct extent_io_tree *tree; + unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->failed_mirror = failed_mirror; - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, -EIO); + } + free_extent_buffer(eb); + +out: return -EIO; /* we fixed nothing */ } @@ -855,15 +847,15 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, { int ret; - if (!(rw & REQ_WRITE)) { + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + if (!(rw & REQ_WRITE)) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); - BUG_ON(ret); return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } @@ -901,6 +893,34 @@ static int btree_migratepage(struct address_space *mapping, } #endif +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct extent_buffer *eb; + int was_dirty; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { + return extent_write_full_page(tree, page, + btree_get_extent, wbc); + } + + redirty_page_for_writepage(wbc, page); + eb = 
btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); + WARN_ON(!eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; + spin_unlock(&root->fs_info->delalloc_lock); + } + free_extent_buffer(eb); + + unlock_page(page); + return 0; +} static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -920,7 +940,7 @@ static int btree_writepages(struct address_space *mapping, if (num_dirty < thresh) return 0; } - return btree_write_cache_pages(mapping, wbc); + return extent_writepages(tree, mapping, btree_get_extent, wbc); } static int btree_readpage(struct file *file, struct page *page) @@ -932,8 +952,16 @@ static int btree_readpage(struct file *file, struct page *page) static int btree_releasepage(struct page *page, gfp_t gfp_flags) { + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + if (PageWriteback(page) || PageDirty(page)) return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -941,7 +969,18 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) */ gfp_flags &= ~GFP_SLAB_BUG_MASK; - return try_release_extent_buffer(page, gfp_flags); + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; } static void btree_invalidatepage(struct page *page, unsigned long offset) @@ -959,28 +998,15 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -static int btree_set_page_dirty(struct page *page) -{ - struct extent_buffer *eb; - - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); - return __set_page_dirty_nobuffers(page); -} - static const struct address_space_operations btree_aops = { .readpage = btree_readpage, + .writepage = btree_writepage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif - .set_page_dirty = btree_set_page_dirty, }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1023,7 +1049,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { free_extent_buffer(buf); return -EIO; - } else if (extent_buffer_uptodate(buf)) { + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { *eb = buf; } else { free_extent_buffer(buf); @@ -1048,20 +1074,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct extent_buffer *eb; eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); + bytenr, blocksize, NULL); return eb; } int btrfs_write_tree_block(struct extent_buffer *buf) { - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return 
filemap_fdatawait_range(buf->pages[0]->mapping, + return filemap_fdatawait_range(buf->first_page->mapping, buf->start, buf->start + buf->len - 1); } @@ -1076,6 +1102,9 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return buf; } @@ -1083,6 +1112,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1098,7 +1128,8 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); } return 0; } @@ -1482,6 +1513,41 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return 0; } +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + return ret; +} + /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -1497,6 +1563,17 @@ static void end_workqueue_fn(struct btrfs_work *work) bio = end_io_wq->bio; fs_info = end_io_wq->info; + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; @@ -1965,7 +2042,6 @@ int open_ctree(struct super_block *sb, RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping); - BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; @@ -2059,38 +2135,10 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - if (btrfs_super_leafsize(disk_super) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. 
node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - btrfs_super_leafsize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_leafsize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; - - /* - * flag our filesystem as having big metadata blocks if - * they are bigger than the page size - */ - if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } - btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -3074,9 +3122,10 @@ int close_ctree(struct btrfs_root *root) int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) { int ret; - struct inode *btree_inode = buf->pages[0]->mapping->host; + struct inode *btree_inode = buf->first_page->mapping->host; - ret = extent_buffer_uptodate(buf); + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, + NULL); if (!ret) return ret; @@ -3087,13 +3136,16 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - return set_extent_buffer_uptodate(buf); + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; u64 transid = btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; int was_dirty; btrfs_assert_tree_locked(buf); @@ -3105,7 +3157,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - was_dirty = set_extent_buffer_dirty(buf); + was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); root->fs_info->dirty_metadata_bytes += buf->len; @@ -3159,8 +3212,12 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; - return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + return ret; } static int btree_lock_page_hook(struct page *page, void *data, @@ -3168,21 +3225,17 @@ static int btree_lock_page_hook(struct page *page, void *data, { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); - /* - * We culled this eb but the page is still hanging out on the mapping, - * carry on. 
- */ - if (!PagePrivate(page)) + if (page->private == EXTENT_PAGE_PRIVATE) goto out; - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - goto out; - } - if (page != eb->pages[0]) + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len); + if (!eb) goto out; if (!btrfs_try_tree_write_lock(eb)) { @@ -3201,6 +3254,7 @@ static int btree_lock_page_hook(struct page *page, void *data, } btrfs_tree_unlock(eb); + free_extent_buffer(eb); out: if (!trylock_page(page)) { flush_fn(data); diff --git a/trunk/fs/btrfs/extent-tree.c b/trunk/fs/btrfs/extent-tree.c index 1b831ac4c079..37e0a800d34e 100644 --- a/trunk/fs/btrfs/extent-tree.c +++ b/trunk/fs/btrfs/extent-tree.c @@ -5018,6 +5018,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); + } else { + invalidate_mapping_pages(info->btree_inode->i_mapping, + bytenr >> PAGE_CACHE_SHIFT, + (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0); @@ -5256,10 +5260,11 @@ static int get_block_group_index(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHING_NOWAIT = 0, - LOOP_CACHING_WAIT = 1, - LOOP_ALLOC_CHUNK = 2, - LOOP_NO_EMPTY_SIZE = 3, + LOOP_FIND_IDEAL = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, }; /* @@ -5273,6 +5278,7 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, u64 data) { @@ -5281,7 +5287,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; struct btrfs_block_group_cache *used_block_group; - u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; @@ -5295,6 +5300,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -5344,6 +5351,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { +ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); used_block_group = block_group; @@ -5355,7 +5363,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, * picked out then we don't care that the block group is cached. 
*/ if (block_group && block_group_bits(block_group, data) && - block_group->cached != BTRFS_CACHE_NO) { + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -5409,12 +5418,44 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { + u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, - orig_root, 0); - BUG_ON(ret); + orig_root, 1); + if (block_group->cached == BTRFS_CACHE_FINISHED) + goto alloc; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + + /* + * The caching workers are limited to 2 threads, so we + * can queue as much work as we care to. + */ + if (loop > LOOP_FIND_IDEAL) { + ret = cache_block_group(block_group, trans, + orig_root, 0); + BUG_ON(ret); + } + + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) + goto loop; } +alloc: if (unlikely(block_group->ro)) goto loop; @@ -5565,6 +5606,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, } checks: search_start = stripe_align(root, offset); + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(used_block_group, offset, num_bytes); + goto loop; + } /* move on to the next group */ if (search_start + num_bytes > @@ -5615,7 +5661,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; - /* + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -5625,7 +5673,45 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { + found_uncached_bg = false; + loop++; + if (!ideal_cache_percent) + goto search; + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. + * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. 
+ */ + loop = LOOP_CACHING_WAIT; + goto search; + } + loop++; + if (loop == LOOP_ALLOC_CHUNK) { if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + @@ -5712,10 +5798,12 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data) + u64 search_end, struct btrfs_key *ins, + u64 data) { bool final_tried = false; int ret; + u64 search_start = 0; data = btrfs_get_alloc_profile(root, data); again: @@ -5730,7 +5818,8 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - hint_byte, ins, data); + search_start, search_end, hint_byte, + ins, data); if (ret == -ENOSPC) { if (!final_tried) { @@ -6018,7 +6107,6 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); - clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); @@ -6126,7 +6214,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, &ins, 0); + empty_size, hint, (u64)-1, &ins, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 49a368593a16..a55fbe6252de 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -19,7 +19,6 @@ #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" -#include "locking.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -54,8 +53,6 @@ struct extent_page_data { unsigned int sync_io:1; }; -static noinline void flush_write_bio(void *data); - int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("extent_state", @@ -1915,26 +1912,6 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return 0; } -int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, - int mirror_num) -{ - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - u64 start = eb->start; - unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); - int ret; - - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, - start, p, mirror_num); - if (ret) - break; - start += PAGE_CACHE_SIZE; - } - - return ret; -} - /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2281,7 +2258,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; - int failed_mirror; int ret; if (err) @@ -2328,16 +2304,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) else clean_io_failure(start, page); } - - if (!uptodate) + if (!uptodate) { + int failed_mirror; failed_mirror = (int)(unsigned long)bio->bi_bdev; - - if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(page, failed_mirror); - if (!ret && !err && - test_bit(BIO_UPTODATE, &bio->bi_flags)) - uptodate = 1; - } else if (!uptodate) { /* * The generic bio_readpage_error 
handles errors the * following way: If possible, new read requests are @@ -2351,6 +2320,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, page, start, end, failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2358,9 +2328,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } - if (uptodate && tree->track_uptodate) { + if (uptodate) { set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } @@ -2496,24 +2473,19 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +void set_page_extent_mapped(struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); page_cache_get(page); - set_page_private(page, (unsigned long)eb); - } else { - WARN_ON(page->private != (unsigned long)eb); + set_page_private(page, EXTENT_PAGE_PRIVATE); } } -void set_page_extent_mapped(struct page *page) +static void set_page_extent_head(struct page *page, unsigned long len) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - page_cache_get(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } + WARN_ON(!PagePrivate(page)); + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } /* @@ -2996,275 +2968,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, return 0; } -static int eb_wait(void *word) -{ - io_schedule(); - return 0; -} - -static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ - wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, - TASK_UNINTERRUPTIBLE); -} - -static int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct extent_page_data *epd) -{ - unsigned long i, num_pages; - int flush = 0; - int ret = 0; - - if (!btrfs_try_tree_write_lock(eb)) { - flush = 1; - flush_write_bio(epd); - btrfs_tree_lock(eb); - } - - if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { - btrfs_tree_unlock(eb); - if (!epd->sync_io) - return 0; - if (!flush) { - flush_write_bio(epd); - flush = 1; - } - while (1) { - wait_on_extent_buffer_writeback(eb); - btrfs_tree_lock(eb); - if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) - break; - btrfs_tree_unlock(eb); - } - } - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - spin_lock(&fs_info->delalloc_lock); - if (fs_info->dirty_metadata_bytes >= eb->len) - fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&fs_info->delalloc_lock); - ret = 1; - } - - btrfs_tree_unlock(eb); - - if (!ret) - return ret; - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - - if (!trylock_page(p)) { - if (!flush) { - flush_write_bio(epd); - flush = 1; - } - lock_page(p); - } - } - - return ret; -} - -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - -static void end_bio_extent_buffer_writepage(struct bio *bio, int err) -{ - int uptodate 
= err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_buffer *eb; - int done; - - do { - struct page *page = bvec->bv_page; - - bvec--; - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - done = atomic_dec_and_test(&eb->io_pages); - - if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - ClearPageUptodate(page); - SetPageError(page); - } - - end_page_writeback(page); - - if (!done) - continue; - - end_extent_buffer_writeback(eb); - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); - -} - -static int write_one_eb(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct writeback_control *wbc, - struct extent_page_data *epd) -{ - struct block_device *bdev = fs_info->fs_devices->latest_bdev; - u64 offset = eb->start; - unsigned long i, num_pages; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE); - int ret; - - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); - atomic_set(&eb->io_pages, num_pages); - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - - clear_page_dirty_for_io(p); - set_page_writeback(p); - ret = submit_extent_page(rw, eb->tree, p, offset >> 9, - PAGE_CACHE_SIZE, 0, bdev, &epd->bio, - -1, end_bio_extent_buffer_writepage, - 0, 0, 0); - if (ret) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - SetPageError(p); - if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) - end_extent_buffer_writeback(eb); - ret = -EIO; - break; - } - offset += PAGE_CACHE_SIZE; - update_nr_written(p, wbc, 1); - unlock_page(p); - } - - if (unlikely(ret)) { - for (; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - unlock_page(p); - } - } - - return ret; -} - -int btree_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; - struct extent_buffer *eb, *prev_eb = NULL; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - int ret = 0; - int done = 0; - int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t index; - pgoff_t end; /* Inclusive */ - int scanned = 0; - int tag; - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - scanned = 1; - } - if (wbc->sync_mode == WB_SYNC_ALL) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; -retry: - if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); - while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; - - scanned = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (!PagePrivate(page)) - continue; - - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - continue; - } - - if (eb == prev_eb) - continue; - - if (!atomic_inc_not_zero(&eb->refs)) { - WARN_ON(1); - continue; - } - - prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, fs_info, &epd); - if (!ret) { - free_extent_buffer(eb); - continue; - } - - ret = 
write_one_eb(eb, fs_info, wbc, &epd); - if (ret) { - done = 1; - free_extent_buffer(eb); - break; - } - free_extent_buffer(eb); - - /* - * the filesystem may choose to bump up nr_to_write. - * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; - } - pagevec_release(&pvec); - cond_resched(); - } - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = 1; - index = 0; - goto retry; - } - flush_write_bio(&epd); - return ret; -} - /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -3845,7 +3548,26 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { - return eb->pages[i]; + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. + */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; } inline unsigned long num_extent_pages(u64 start, u64 len) @@ -3854,19 +3576,6 @@ inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } -static void __free_extent_buffer(struct extent_buffer *eb) -{ -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - if (eb->pages && eb->pages != eb->inline_pages) - kfree(eb->pages); - kmem_cache_free(extent_buffer_cache, eb); -} - static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -3882,7 +3591,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - eb->tree = tree; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3899,32 +3607,20 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); #endif - spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - atomic_set(&eb->io_pages, 0); - - if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { - struct page **pages; - int num_pages = (len + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - pages = kzalloc(num_pages, mask); - if (!pages) { - __free_extent_buffer(eb); - return NULL; - } - eb->pages = pages; - } else { - eb->pages = eb->inline_pages; - } return eb; } -static int extent_buffer_under_io(struct extent_buffer *eb) +static void __free_extent_buffer(struct extent_buffer *eb) { - return (atomic_read(&eb->io_pages) || - test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || - test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); } /* @@ -3936,7 +3632,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; struct page *page; - BUG_ON(extent_buffer_under_io(eb)); + if (!eb->first_page) + 
return; index = num_extent_pages(eb->start, eb->len); if (start_idx >= index) @@ -3945,34 +3642,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) { - spin_lock(&page->mapping->private_lock); - /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. - */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { - BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - /* One for when we alloced the page */ + if (page) page_cache_release(page); - } } while (index != start_idx); } @@ -3985,50 +3656,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } -static void check_buffer_tree_ref(struct extent_buffer *eb) -{ - /* the ref bit is tricky. We have to make sure it is set - * if we have the buffer dirty. Otherwise the - * code to free a buffer can end up dropping a dirty - * page - * - * Once the ref bit is set, it won't go away while the - * buffer is dirty or in writeback, and it also won't - * go away while we have the reference count on the - * eb bumped. - * - * We can't just set the ref bit without bumping the - * ref on the eb because free_extent_buffer might - * see the ref bit and try to clear it. If this happens - * free_extent_buffer might end up dropping our original - * ref by mistake and freeing the page before we are able - * to add one more ref. - * - * So bump the ref count first, then set the bit. If someone - * beat us to it, drop the ref we added. 
- */ - if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { - atomic_inc(&eb->refs); - if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); - } -} - -static void mark_extent_buffer_accessed(struct extent_buffer *eb) -{ - unsigned long num_pages, i; - - check_buffer_tree_ref(eb); - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - mark_page_accessed(p); - } -} - struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len) + u64 start, unsigned long len, + struct page *page0) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -4044,7 +3674,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + mark_page_accessed(eb->first_page); return eb; } rcu_read_unlock(); @@ -4053,43 +3683,32 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (!eb) return NULL; - for (i = 0; i < num_pages; i++, index++) { + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; } - - spin_lock(&mapping->private_lock); - if (PagePrivate(p)) { - /* - * We could have already allocated an eb for this page - * and attached one so lets see if we can get a ref on - * the existing eb, and if we can we know it's good and - * we can just return that one, else we know we can just - * overwrite page->private. - */ - exists = (struct extent_buffer *)p->private; - if (atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - mark_extent_buffer_accessed(exists); - goto free_eb; - } - - /* - * Do this so attach doesn't complain and we need to - * drop the ref the old guy had. 
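The check_buffer_tree_ref() helper deleted in this hunk orders its operations carefully: the reference count is bumped before the TREE_REF bit is set, and if another thread set the bit first the extra reference is dropped again, so a concurrent free can never observe the bit without a matching reference. A userspace model of that bump-then-set pattern using C11 atomics follows; sketch_eb, sketch_check_tree_ref and the atomic_bool standing in for the bit are illustrative, not the kernel API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct sketch_eb {
    atomic_int  refs;
    atomic_bool tree_ref;   /* stands in for the TREE_REF bit in bflags */
};

/*
 * Take the "tree reference": bump the refcount first, then try to set the
 * bit.  If another thread already set it, give the extra reference back.
 * This mirrors the bump-then-set ordering of the removed helper.
 */
static void sketch_check_tree_ref(struct sketch_eb *eb)
{
    if (!atomic_load(&eb->tree_ref)) {
        atomic_fetch_add(&eb->refs, 1);
        if (atomic_exchange(&eb->tree_ref, true))   /* someone beat us to it */
            atomic_fetch_sub(&eb->refs, 1);
    }
}

int main(void)
{
    struct sketch_eb eb = { .refs = 1, .tree_ref = false };

    sketch_check_tree_ref(&eb);   /* takes the tree ref: refs becomes 2 */
    sketch_check_tree_ref(&eb);   /* already set: refs stays 2 */
    printf("refs=%d tree_ref=%d\n", atomic_load(&eb.refs),
           (int)atomic_load(&eb.tree_ref));
    return 0;
}
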
- */ - ClearPagePrivate(p); - WARN_ON(PageDirty(p)); - page_cache_release(p); - } - attach_extent_buffer_page(eb, p); - spin_unlock(&mapping->private_lock); - WARN_ON(PageDirty(p)); + set_page_extent_mapped(p); mark_page_accessed(p); - eb->pages[i] = p; + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } if (!PageUptodate(p)) uptodate = 0; @@ -4097,10 +3716,12 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * see below about how we avoid a nasty race with release page * and why we unlock later */ + if (i != 0) + unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto free_eb; @@ -4110,21 +3731,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (ret == -EEXIST) { exists = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - exists = NULL; - goto again; - } + /* add one reference for the caller */ + atomic_inc(&exists->refs); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); - mark_extent_buffer_accessed(exists); goto free_eb; } /* add one reference for the tree */ - spin_lock(&eb->refs_lock); - check_buffer_tree_ref(eb); - spin_unlock(&eb->refs_lock); + atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -4137,20 +3751,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - SetPageChecked(eb->pages[0]); - for (i = 1; i < num_pages; i++) { - p = extent_buffer_page(eb, i); - ClearPageChecked(p); - unlock_page(p); - } - unlock_page(eb->pages[0]); + set_page_extent_mapped(eb->first_page); + set_page_extent_head(eb->first_page, eb->len); + if (!page0) + unlock_page(eb->first_page); return eb; free_eb: - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) - unlock_page(eb->pages[i]); - } + if (eb->first_page && !page0) + unlock_page(eb->first_page); if (!atomic_dec_and_test(&eb->refs)) return exists; @@ -4167,7 +3776,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + mark_page_accessed(eb->first_page); return eb; } rcu_read_unlock(); @@ -4175,78 +3784,25 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) -{ - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - __free_extent_buffer(eb); -} - -/* Expects to have eb->eb_lock already held */ -static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) -{ - WARN_ON(atomic_read(&eb->refs) == 0); - if (atomic_dec_and_test(&eb->refs)) { - struct extent_io_tree *tree = eb->tree; - - spin_unlock(&eb->refs_lock); - - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, - eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); - - /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_page(eb, 0); - - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return; - } - spin_unlock(&eb->refs_lock); -} - void free_extent_buffer(struct extent_buffer *eb) { if (!eb) return; - 
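alloc_extent_buffer() in the restored form inserts the freshly built buffer into the per-tree radix tree and, when radix_tree_insert() reports -EEXIST, simply takes a reference on the buffer that won the race and releases its own copy. The single-threaded sketch below models that insert-or-adopt control flow over a small fixed table; the table, the modulo indexing and every sketch_* name are stand-ins for the radix tree, not real interfaces.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define SKETCH_SLOTS 64

struct sketch_eb {
    unsigned long index;
    int refs;
};

static struct sketch_eb *slots[SKETCH_SLOTS];   /* stands in for the radix tree */

static int sketch_insert(unsigned long index, struct sketch_eb *eb)
{
    if (slots[index % SKETCH_SLOTS])
        return -EEXIST;
    slots[index % SKETCH_SLOTS] = eb;
    return 0;
}

/* return the buffer at "index", creating and inserting it if needed */
static struct sketch_eb *sketch_alloc_eb(unsigned long index)
{
    struct sketch_eb *eb, *exists;

    eb = calloc(1, sizeof(*eb));
    if (!eb)
        return NULL;
    eb->index = index;
    eb->refs = 1;                 /* the caller's reference */

    if (sketch_insert(index, eb) == -EEXIST) {
        /* lost the race: reference the existing buffer, drop ours */
        exists = slots[index % SKETCH_SLOTS];
        exists->refs++;
        free(eb);
        return exists;
    }
    eb->refs++;                   /* one reference for the tree itself */
    return eb;
}

int main(void)
{
    struct sketch_eb *a = sketch_alloc_eb(7);
    struct sketch_eb *b = sketch_alloc_eb(7);   /* hits the -EEXIST path */

    printf("same buffer: %d, refs: %d\n", a == b, a->refs);
    return 0;
}
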
spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && - !extent_buffer_under_io(eb) && - test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); - - /* - * I know this is terrible, but it's temporary until we stop tracking - * the uptodate bits and such for the extent buffers. - */ - release_extent_buffer(eb, GFP_ATOMIC); -} - -void free_extent_buffer_stale(struct extent_buffer *eb) -{ - if (!eb) + if (!atomic_dec_and_test(&eb->refs)) return; - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_STALE, &eb->bflags); - - if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && - test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); - release_extent_buffer(eb, GFP_NOFS); + WARN_ON(1); } -int clear_extent_buffer_dirty(struct extent_buffer *eb) +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; struct page *page; num_pages = num_extent_pages(eb->start, eb->len); - WARN_ON(atomic_read(&eb->refs) == 0); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); @@ -4256,6 +3812,10 @@ int clear_extent_buffer_dirty(struct extent_buffer *eb) lock_page(page); WARN_ON(!PagePrivate(page)); + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -4267,30 +3827,24 @@ int clear_extent_buffer_dirty(struct extent_buffer *eb) ClearPageError(page); unlock_page(page); } - WARN_ON(atomic_read(&eb->refs) == 0); return 0; } -int set_extent_buffer_dirty(struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; - check_buffer_tree_ref(eb); - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); - WARN_ON(atomic_read(&eb->refs) == 0); - WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); - for (i = 0; i < num_pages; i++) - set_page_dirty(extent_buffer_page(eb, i)); + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); return was_dirty; } -static int range_straddles_pages(u64 start, u64 len) +static int __eb_straddles_pages(u64 start, u64 len) { if (len < PAGE_CACHE_SIZE) return 1; @@ -4301,14 +3855,25 @@ static int range_straddles_pages(u64 start, u64 len) return 0; } -int clear_extent_buffer_uptodate(struct extent_buffer *eb) +static int eb_straddles_pages(struct extent_buffer *eb) +{ + return __eb_straddles_pages(eb->start, eb->len); +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state **cached_state) { unsigned long i; struct page *page; unsigned long num_pages; - clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -4317,16 +3882,27 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb) return 0; } -int set_extent_buffer_uptodate(struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 
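__eb_straddles_pages() (its middle lines are elided by the hunk context above) decides whether an extent buffer's byte range lines up with whole pages; when it does not, the per-range EXTENT_UPTODATE bits have to be consulted instead of the per-page flags. The sketch below is an assumed reconstruction of that test: a range shorter than a page, or unaligned at either end, straddles page boundaries. The 4 KiB page size and the sketch_* name are assumptions.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL   /* assume 4 KiB pages */

/*
 * Nonzero when the byte range [start, start + len) does not line up with
 * whole pages, i.e. when some page is only partially covered.  The body of
 * the kernel helper is partly elided by the diff context, so this is an
 * assumed reconstruction.
 */
static int sketch_straddles_pages(uint64_t start, uint64_t len)
{
    if (len < SKETCH_PAGE_SIZE)
        return 1;
    if (start & (SKETCH_PAGE_SIZE - 1))
        return 1;
    if ((start + len) & (SKETCH_PAGE_SIZE - 1))
        return 1;
    return 0;
}

int main(void)
{
    printf("%d\n", sketch_straddles_pages(0, 16384));      /* 0: whole pages */
    printf("%d\n", sketch_straddles_pages(1024, 16384));   /* 1: unaligned   */
    return 0;
}
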
num_pages = num_extent_pages(eb->start, eb->len); + + if (eb_straddles_pages(eb)) { + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } SetPageUptodate(page); } return 0; @@ -4341,7 +3917,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - if (range_straddles_pages(start, end - start + 1)) { + if (__eb_straddles_pages(start, end - start + 1)) { ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) @@ -4363,9 +3939,35 @@ int extent_range_uptodate(struct extent_io_tree *tree, return pg_uptodate; } -int extent_buffer_uptodate(struct extent_buffer *eb) +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state *cached_state) { - return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 1; + + if (eb_straddles_pages(eb)) { + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; } int read_extent_buffer_pages(struct extent_io_tree *tree, @@ -4379,14 +3981,21 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; + int inc_all_pages = 0; unsigned long num_pages; - unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + if (eb_straddles_pages(eb)) { + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + } + if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -4405,10 +4014,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) { - num_reads++; + if (!PageUptodate(page)) all_uptodate = 0; - } } if (all_uptodate) { if (start_i == 0) @@ -4416,12 +4023,20 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->failed_mirror = 0; - atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); + + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + + if (inc_all_pages) + page_cache_get(page); if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, @@ -4446,6 +4061,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } + if (!ret) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -4687,20 +4304,15 @@ static void copy_pages(struct page *dst_page, struct page *src_page, { char *dst_kaddr = page_address(dst_page); char *src_kaddr; - int must_memmove = 0; if (dst_page != src_page) { src_kaddr = page_address(src_page); } else { 
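set_extent_buffer_uptodate() in the restored form cannot blindly mark the first and last pages uptodate when the buffer does not start or end on a page boundary, because other data shares those pages; it re-checks them instead. The sketch below isolates that partial-page test; the page size and the sketch_* names are assumptions.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE  4096UL
#define SKETCH_PAGE_SHIFT 12

static unsigned long sketch_num_pages(uint64_t start, uint64_t len)
{
    return ((start + len + SKETCH_PAGE_SIZE - 1) >> SKETCH_PAGE_SHIFT) -
           (start >> SKETCH_PAGE_SHIFT);
}

/*
 * True when page "i" of the buffer at [start, start + len) is only partially
 * covered: the first page when start is unaligned, or the last page when the
 * end is unaligned.  Such pages cannot simply be marked uptodate wholesale.
 */
static int sketch_page_is_partial(uint64_t start, uint64_t len, unsigned long i)
{
    unsigned long num_pages = sketch_num_pages(start, len);

    if (i == 0 && (start & (SKETCH_PAGE_SIZE - 1)))
        return 1;
    if (i == num_pages - 1 && ((start + len) & (SKETCH_PAGE_SIZE - 1)))
        return 1;
    return 0;
}

int main(void)
{
    /* a 16 KiB buffer starting 1 KiB into a page: pages 0 and 4 are partial */
    for (unsigned long i = 0; i < sketch_num_pages(1024, 16384); i++)
        printf("page %lu partial=%d\n", i,
               sketch_page_is_partial(1024, 16384, i));
    return 0;
}
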
src_kaddr = dst_kaddr; - if (areas_overlap(src_off, dst_off, len)) - must_memmove = 1; + BUG_ON(areas_overlap(src_off, dst_off, len)); } - if (must_memmove) - memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); - else - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -4770,7 +4382,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (dst_offset < src_offset) { + if (!areas_overlap(src_offset, dst_offset, len)) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } @@ -4796,48 +4408,47 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -int try_release_extent_buffer(struct page *page, gfp_t mask) +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) { - struct extent_buffer *eb; + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); - /* - * We need to make sure noboody is attaching this page to an eb right - * now. - */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); - return 1; - } + btrfs_release_extent_buffer(eb); +} - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer *eb; + int ret = 1; - /* - * This is a little awful but should be ok, we need to make sure that - * the eb doesn't disappear out from under us while we're looking at - * this page. - */ - spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { - spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); - return 0; + spin_lock(&tree->buffer_lock); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (!eb) { + spin_unlock(&tree->buffer_lock); + return ret; } - spin_unlock(&page->mapping->private_lock); - if ((mask & GFP_NOFS) == GFP_NOFS) - mask = GFP_NOFS; + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } /* - * If tree ref isn't set then we know the ref on this eb is a real ref, - * so just return, this page will likely be freed soon anyway. + * set @eb->refs to 0 if it is already 1, and then release the @eb. + * Or go back. 
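copy_pages() and memmove_extent_buffer() hinge on an overlap test: within one page, overlapping source and destination ranges need memmove() semantics, while disjoint ranges can use the cheaper memcpy(). areas_overlap() itself is not shown in this hunk, so the sketch below uses the usual half-open interval test; all sketch_* names are illustrative.

#include <stdio.h>
#include <string.h>

/* half-open interval overlap test, the usual form of such a check */
static int sketch_areas_overlap(unsigned long src, unsigned long dst,
                                unsigned long len)
{
    return src < dst + len && dst < src + len;
}

/*
 * Copy "len" bytes inside one buffer.  memcpy() is only safe when the two
 * ranges do not overlap; otherwise memmove() must be used.
 */
static void sketch_copy_within(char *buf, unsigned long dst_off,
                               unsigned long src_off, unsigned long len)
{
    if (sketch_areas_overlap(src_off, dst_off, len))
        memmove(buf + dst_off, buf + src_off, len);
    else
        memcpy(buf + dst_off, buf + src_off, len);
}

int main(void)
{
    char buf[32] = "abcdefghijklmnop";

    sketch_copy_within(buf, 4, 0, 8);   /* overlapping: takes the memmove path */
    printf("%s\n", buf);
    return 0;
}
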
*/ - if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { - spin_unlock(&eb->refs_lock); - return 0; + if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { + ret = 0; + goto out; } - release_extent_buffer(eb, mask); - return 1; + radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); +out: + spin_unlock(&tree->buffer_lock); + + /* at this point we can safely release the extent buffer */ + if (atomic_read(&eb->refs) == 0) + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return ret; } diff --git a/trunk/fs/btrfs/extent_io.h b/trunk/fs/btrfs/extent_io.h index 38c1af7092f3..cecc3518c121 100644 --- a/trunk/fs/btrfs/extent_io.h +++ b/trunk/fs/btrfs/extent_io.h @@ -35,10 +35,6 @@ #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ -#define EXTENT_BUFFER_TREE_REF 5 -#define EXTENT_BUFFER_STALE 6 -#define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_IOERR 8 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -58,7 +54,6 @@ #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; -struct btrfs_root; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -74,7 +69,9 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, int failed_mirror, + struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, struct extent_state *state); @@ -100,7 +97,6 @@ struct extent_io_tree { struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; - int track_uptodate; spinlock_t lock; spinlock_t buffer_lock; struct extent_io_ops *ops; @@ -123,21 +119,16 @@ struct extent_state { struct list_head leak_list; }; -#define INLINE_EXTENT_BUFFER_PAGES 16 -#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) struct extent_buffer { u64 start; unsigned long len; unsigned long map_start; unsigned long map_len; + struct page *first_page; unsigned long bflags; - struct extent_io_tree *tree; - spinlock_t refs_lock; - atomic_t refs; - atomic_t io_pages; - int failed_mirror; struct list_head leak_list; struct rcu_head rcu_head; + atomic_t refs; pid_t lock_owner; /* count of read lock holders on the extent buffer */ @@ -161,9 +152,6 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; - wait_queue_head_t lock_wq; - struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; - struct page **pages; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -190,7 +178,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page, gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -252,8 +240,6 @@ int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, get_extent_t *get_extent, struct writeback_control *wbc); -int btree_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc); 
int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, struct list_head *pages, unsigned nr_pages, @@ -265,11 +251,11 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len); + u64 start, unsigned long len, + struct page *page0); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); -void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 @@ -302,11 +288,18 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -int clear_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_uptodate(struct extent_buffer *eb); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state **cached_state); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state *cached_state); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, @@ -327,6 +320,4 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, - int mirror_num); #endif diff --git a/trunk/fs/btrfs/inode-item.c b/trunk/fs/btrfs/inode-item.c index 6ea71c60e80a..baa74f3db691 100644 --- a/trunk/fs/btrfs/inode-item.c +++ b/trunk/fs/btrfs/inode-item.c @@ -19,7 +19,6 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" static int find_name_in_backref(struct btrfs_path *path, const char *name, int name_len, struct btrfs_inode_ref **ref_ret) diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index 341a8670165f..892b34785ccc 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -634,7 +634,8 @@ static noinline int submit_compressed_extents(struct inode *inode, ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, &ins, 1); + 0, alloc_hint, + (u64)-1, &ins, 1); btrfs_end_transaction(trans, root); if (ret) { @@ -837,7 +838,7 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - &ins, 1); + (u64)-1, &ins, 1); BUG_ON(ret); em = alloc_extent_map(); @@ -5413,7 +5414,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, 
len); ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, - alloc_hint, &ins, 1); + alloc_hint, (u64)-1, &ins, 1); if (ret) { em = ERR_PTR(ret); goto out; @@ -6782,8 +6783,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); - ei->io_tree.track_uptodate = 1; - ei->io_failure_tree.track_uptodate = 1; mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -7316,7 +7315,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, &ins, 1); + 0, *alloc_hint, (u64)-1, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); diff --git a/trunk/fs/btrfs/reada.c b/trunk/fs/btrfs/reada.c index dc5d33146fdb..22db04550f6a 100644 --- a/trunk/fs/btrfs/reada.c +++ b/trunk/fs/btrfs/reada.c @@ -54,6 +54,7 @@ * than the 2 started one after another. */ +#define MAX_MIRRORS 2 #define MAX_IN_FLIGHT 6 struct reada_extctl { @@ -70,7 +71,7 @@ struct reada_extent { struct list_head extctl; struct kref refcnt; spinlock_t lock; - struct reada_zone *zones[BTRFS_MAX_MIRRORS]; + struct reada_zone *zones[MAX_MIRRORS]; int nzones; struct btrfs_device *scheduled_for; }; @@ -83,8 +84,7 @@ struct reada_zone { spinlock_t lock; int locked; struct btrfs_device *device; - struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl - * self */ + struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ int ndevs; struct kref refcnt; }; @@ -365,9 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (ret || !bbio || length < blocksize) goto error; - if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { + if (bbio->num_stripes > MAX_MIRRORS) { printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", BTRFS_MAX_MIRRORS); + "supported", MAX_MIRRORS); goto error; } diff --git a/trunk/fs/btrfs/scrub.c b/trunk/fs/btrfs/scrub.c index 5221e072bb65..abc0fbffa510 100644 --- a/trunk/fs/btrfs/scrub.c +++ b/trunk/fs/btrfs/scrub.c @@ -36,30 +36,37 @@ * Future enhancements: * - In case an unrepairable extent is encountered, track which files are * affected and report them + * - In case of a read error on files with nodatasum, map the file and read + * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space */ -struct scrub_block; +struct scrub_bio; +struct scrub_page; struct scrub_dev; +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_checksum(struct btrfs_work *work); +static int scrub_checksum_data(struct scrub_dev *sdev, + struct scrub_page *spag, void *buffer); +static int scrub_checksum_tree_block(struct scrub_dev *sdev, + struct scrub_page *spag, u64 logical, + void *buffer); +static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); +static int scrub_fixup_check(struct scrub_bio *sbio, int ix); +static void scrub_fixup_end_io(struct bio *bio, int err); +static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, + struct page *page); +static void scrub_fixup(struct scrub_bio *sbio, int ix); #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ -#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { - struct 
scrub_block *sblock; - struct page *page; - struct block_device *bdev; u64 flags; /* extent flags */ u64 generation; - u64 logical; - u64 physical; - struct { - unsigned int mirror_num:8; - unsigned int have_csum:1; - unsigned int io_error:1; - }; + int mirror_num; + int have_csum; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -70,25 +77,12 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; - int page_count; + struct scrub_page spag[SCRUB_PAGES_PER_BIO]; + u64 count; int next_free; struct btrfs_work work; }; -struct scrub_block { - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; - int page_count; - atomic_t outstanding_pages; - atomic_t ref_count; /* free mem on transition to zero */ - struct scrub_dev *sdev; - struct { - unsigned int header_error:1; - unsigned int checksum_error:1; - unsigned int no_io_error_seen:1; - }; -}; - struct scrub_dev { struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; struct btrfs_device *dev; @@ -102,10 +96,6 @@ struct scrub_dev { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ - u32 sectorsize; - u32 nodesize; - u32 leafsize; /* * statistics */ @@ -134,43 +124,6 @@ struct scrub_warning { int scratch_bufsize; }; - -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, - struct btrfs_mapping_tree *map_tree, - u64 length, u64 logical, - struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size); -static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int is_metadata, int have_csum, - const u8 *csum, u64 generation, - u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write); -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write); -static int scrub_checksum_data(struct scrub_block *sblock); -static int scrub_checksum_tree_block(struct scrub_block *sblock); -static int scrub_checksum_super(struct scrub_block *sblock); -static void scrub_block_get(struct scrub_block *sblock); -static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev *sdev, - struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force); -static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_bio_end_io_worker(struct btrfs_work *work); -static void scrub_block_complete(struct scrub_block *sblock); - - static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -182,30 +135,37 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +static void scrub_free_bio(struct bio *bio) { int i; + struct page *last_page = NULL; - if (!sdev) + if (!bio) return; - /* this can happen when scrub is cancelled */ - if (sdev->curr != -1) { - struct scrub_bio *sbio = sdev->bios[sdev->curr]; - - for (i = 0; i < sbio->page_count; i++) { - BUG_ON(!sbio->pagev[i]); - BUG_ON(!sbio->pagev[i]->page); - 
scrub_block_put(sbio->pagev[i]->sblock); - } - bio_put(sbio->bio); + for (i = 0; i < bio->bi_vcnt; ++i) { + if (bio->bi_io_vec[i].bv_page == last_page) + continue; + last_page = bio->bi_io_vec[i].bv_page; + __free_page(last_page); } + bio_put(bio); +} + +static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +{ + int i; + + if (!sdev) + return; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; if (!sbio) break; + + scrub_free_bio(sbio->bio); kfree(sbio); } @@ -219,16 +179,11 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) struct scrub_dev *sdev; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_bio; - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, - bio_get_nr_vecs(dev->bdev)); sdev = kzalloc(sizeof(*sdev), GFP_NOFS); if (!sdev) goto nomem; sdev->dev = dev; - sdev->pages_per_bio = pages_per_bio; - sdev->curr = -1; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio; @@ -239,8 +194,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sbio->index = i; sbio->sdev = sdev; - sbio->page_count = 0; - sbio->work.func = scrub_bio_end_io_worker; + sbio->count = 0; + sbio->work.func = scrub_checksum; if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -248,9 +203,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->bios[i]->next_free = -1; } sdev->first_free = 0; - sdev->nodesize = dev->dev_root->nodesize; - sdev->leafsize = dev->dev_root->leafsize; - sdev->sectorsize = dev->dev_root->sectorsize; + sdev->curr = -1; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); @@ -341,9 +294,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) return 0; } -static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, + int ix) { - struct btrfs_device *dev = sblock->sdev->dev; + struct btrfs_device *dev = sbio->sdev->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -362,9 +316,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - BUG_ON(sblock->page_count < 1); - swarn.sector = (sblock->pagev[0].physical) >> 9; - swarn.logical = sblock->pagev[0].logical; + swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + swarn.logical = sbio->logical + ix * PAGE_SIZE; swarn.errstr = errstr; swarn.dev = dev; swarn.msg_bufsize = bufsize; @@ -389,8 +342,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING - "btrfs: %s at logical %llu on dev %s, " + printk(KERN_WARNING "%s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, dev->name, (unsigned long long)swarn.sector, @@ -579,9 +531,9 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR - "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, sdev->dev->name); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup " + "(nodatasum) error at logical %llu\n", + fixup->logical); } 
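scrub_setup_dev() strings the fixed array of scrub_bio slots into an index-linked free list: next_free points at the following slot, -1 terminates the chain, and first_free names the head; a finished bio is pushed back onto the head by the worker. The sketch below models that allocator in userspace with hypothetical sketch_* names; the list_lock that protects first_free in the kernel is omitted here.

#include <stdio.h>

#define SKETCH_NR_SLOTS 16        /* mirrors SCRUB_BIOS_PER_DEV */

struct sketch_slot {
    int index;
    int next_free;                /* index of the next free slot, -1 if none */
};

static struct sketch_slot slots[SKETCH_NR_SLOTS];
static int first_free;

static void sketch_init(void)
{
    for (int i = 0; i < SKETCH_NR_SLOTS; i++) {
        slots[i].index = i;
        slots[i].next_free = (i == SKETCH_NR_SLOTS - 1) ? -1 : i + 1;
    }
    first_free = 0;
}

/* pop a slot off the free list, or return NULL when all are in flight */
static struct sketch_slot *sketch_get(void)
{
    if (first_free == -1)
        return NULL;
    struct sketch_slot *s = &slots[first_free];
    first_free = s->next_free;
    return s;
}

/* push a finished slot back onto the head of the free list */
static void sketch_put(struct sketch_slot *s)
{
    s->next_free = first_free;
    first_free = s->index;
}

int main(void)
{
    sketch_init();
    struct sketch_slot *a = sketch_get();
    struct sketch_slot *b = sketch_get();
    printf("got slots %d and %d\n", a->index, b->index);
    sketch_put(a);
    printf("next allocation reuses slot %d\n", sketch_get()->index);
    return 0;
}
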
btrfs_free_path(path); @@ -598,168 +550,91 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) } /* - * scrub_handle_errored_block gets called when either verification of the - * pages failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all pages in the bio, even though only one - * may be bad. - * The goal of this function is to repair the errored block by using the - * contents of one of the mirrors. + * scrub_recheck_error gets called when either verification of the page + * failed or the bio failed to read, e.g. with EIO. In the latter case, + * recheck_error gets called for every page in the bio, even though only + * one may be bad */ -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { - struct scrub_dev *sdev = sblock_to_check->sdev; - struct btrfs_fs_info *fs_info; - u64 length; - u64 logical; - u64 generation; - unsigned int failed_mirror_index; - unsigned int is_metadata; - unsigned int have_csum; - u8 *csum; - struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ - struct scrub_block *sblock_bad; - int ret; - int mirror_index; - int page_num; - int success; + struct scrub_dev *sdev = sbio->sdev; + u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - BUG_ON(sblock_to_check->page_count < 1); - fs_info = sdev->dev->dev_root->fs_info; - length = sblock_to_check->page_count * PAGE_SIZE; - logical = sblock_to_check->pagev[0].logical; - generation = sblock_to_check->pagev[0].generation; - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0].flags & - BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0].have_csum; - csum = sblock_to_check->pagev[0].csum; - - /* - * read all mirrors one after the other. This includes to - * re-read the extent or metadata block that failed (that was - * the cause that this fixup code is called) another time, - * page by page this time in order to know which pages - * caused I/O errors and which ones are good (for all mirrors). - * It is the goal to handle the situation when more than one - * mirror contains I/O errors, but the errors do not - * overlap, i.e. the data can be repaired by selecting the - * pages from those mirrors without I/O error on the - * particular pages. One example (with blocks >= 2 * PAGE_SIZE) - * would be that mirror #1 has an I/O error on the first page, - * the second page is good, and mirror #2 has an I/O error on - * the second page, but the first page is good. - * Then the first page of the first mirror can be repaired by - * taking the first page of the second mirror, and the - * second page of the second mirror can be repaired by - * copying the contents of the 2nd page of the 1st mirror. - * One more note: if the pages of one mirror contain I/O - * errors, the checksum cannot be verified. In order to get - * the best data for repairing, the first attempt is to find - * a mirror without I/O errors and with a validated checksum. - * Only if this is not possible, the pages are picked from - * mirrors with I/O errors without considering the checksum. - * If the latter is the case, at the end, the checksum of the - * repaired area is verified in order to correctly maintain - * the statistics. 
- */ - - sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * - sizeof(*sblocks_for_recheck), - GFP_NOFS); - if (!sblocks_for_recheck) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - goto out; - } + DEFAULT_RATELIMIT_BURST); - /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, - logical, sblocks_for_recheck); - if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - goto out; + if (sbio->err) { + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, + sbio->bio->bi_io_vec[ix].bv_page) == 0) { + if (scrub_fixup_check(sbio, ix) == 0) + return 0; + } + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sbio, ix); + } else { + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sbio, ix); } - BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); - sblock_bad = sblocks_for_recheck + failed_mirror_index; - /* build and submit the bios for the failed mirror, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sdev->csum_size); - if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - goto out; - } + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); - if (!sblock_bad->header_error && !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) { - /* - * the error disappeared after reading page by page, or - * the area was part of a huge bio and other parts of the - * bio caused I/O errors, or the block layer merged several - * read requests into one and the error is caused by a - * different bio (usually one of the two latter cases is - * the cause) - */ - spin_lock(&sdev->stat_lock); - sdev->stat.unverified_errors++; - spin_unlock(&sdev->stat_lock); + scrub_fixup(sbio, ix); + return 1; +} - goto out; - } +static int scrub_fixup_check(struct scrub_bio *sbio, int ix) +{ + int ret = 1; + struct page *page; + void *buffer; + u64 flags = sbio->spag[ix].flags; - if (!sblock_bad->no_io_error_seen) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - spin_unlock(&sdev->stat_lock); - if (__ratelimit(&_rs)) - scrub_print_warning("i/o error", sblock_to_check); - } else if (sblock_bad->checksum_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.csum_errors++; - spin_unlock(&sdev->stat_lock); - if (__ratelimit(&_rs)) - scrub_print_warning("checksum error", sblock_to_check); - } else if (sblock_bad->header_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.verify_errors++; - spin_unlock(&sdev->stat_lock); - if (__ratelimit(&_rs)) - scrub_print_warning("checksum/header error", - sblock_to_check); + page = sbio->bio->bi_io_vec[ix].bv_page; + buffer = kmap_atomic(page, KM_USER0); + if (flags & BTRFS_EXTENT_FLAG_DATA) { + ret = scrub_checksum_data(sbio->sdev, + sbio->spag + ix, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = scrub_checksum_tree_block(sbio->sdev, + sbio->spag + ix, + sbio->logical + ix * PAGE_SIZE, + buffer); + } else { + WARN_ON(1); } + kunmap_atomic(buffer, KM_USER0); - if (sdev->readonly) - goto did_not_correct_error; + return ret; +} - if (!is_metadata && !have_csum) { - struct scrub_fixup_nodatasum *fixup_nodatasum; +static void scrub_fixup_end_io(struct bio *bio, int err) +{ 
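scrub_fixup_check() as restored maps the page and verifies it with whichever checksum routine matches the extent flags: data extents through the data path, tree blocks through the metadata path, anything else is flagged. The sketch below models only that dispatch; the flag values and both verify stubs are hypothetical stand-ins, not the btrfs definitions.

#include <stdio.h>
#include <stdint.h>

#define SKETCH_FLAG_DATA        (1ULL << 0)   /* stand-in flag values */
#define SKETCH_FLAG_TREE_BLOCK  (1ULL << 1)

/* verify stubs: return 0 on success, 1 on checksum failure */
static int sketch_check_data(const void *buf)        { (void)buf; return 0; }
static int sketch_check_tree_block(const void *buf)  { (void)buf; return 0; }

/* pick the verifier that matches the extent type of this page */
static int sketch_verify_page(uint64_t flags, const void *buf)
{
    if (flags & SKETCH_FLAG_DATA)
        return sketch_check_data(buf);
    if (flags & SKETCH_FLAG_TREE_BLOCK)
        return sketch_check_tree_block(buf);

    fprintf(stderr, "unexpected extent flags 0x%llx\n",
            (unsigned long long)flags);
    return 1;
}

int main(void)
{
    char page[4096] = { 0 };

    printf("data page ok: %d\n", sketch_verify_page(SKETCH_FLAG_DATA, page) == 0);
    printf("tree page ok: %d\n", sketch_verify_page(SKETCH_FLAG_TREE_BLOCK, page) == 0);
    return 0;
}
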
+ complete((struct completion *)bio->bi_private); +} - /* - * !is_metadata and !have_csum, this means that the data - * might not be COW'ed, that it might be modified - * concurrently. The general strategy to work on the - * commit root does not help in the case when COW is not - * used. - */ - fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); - if (!fixup_nodatasum) - goto did_not_correct_error; - fixup_nodatasum->sdev = sdev; - fixup_nodatasum->logical = logical; - fixup_nodatasum->root = fs_info->extent_root; - fixup_nodatasum->mirror_num = failed_mirror_index + 1; +static void scrub_fixup(struct scrub_bio *sbio, int ix) +{ + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_bio *bbio = NULL; + struct scrub_fixup_nodatasum *fixup; + u64 logical = sbio->logical + ix * PAGE_SIZE; + u64 length; + int i; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && + (sbio->spag[ix].have_csum == 0)) { + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + goto uncorrectable; + fixup->sdev = sdev; + fixup->logical = logical; + fixup->root = fs_info->extent_root; + fixup->mirror_num = sbio->spag[ix].mirror_num; /* * increment scrubs_running to prevent cancel requests from * completing as long as a fixup worker is running. we must also @@ -774,529 +649,235 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); atomic_inc(&sdev->fixup_cnt); - fixup_nodatasum->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, - &fixup_nodatasum->work); - goto out; + fixup->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); + return; } - /* - * now build and submit the bios for the other mirrors, check - * checksums - */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - if (mirror_index == failed_mirror_index) - continue; - - /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, - sblocks_for_recheck + mirror_index, - is_metadata, have_csum, csum, - generation, sdev->csum_size); - if (ret) - goto did_not_correct_error; + length = PAGE_SIZE; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { + printk(KERN_ERR + "scrub_fixup: btrfs_map_block failed us for %llu\n", + (unsigned long long)logical); + WARN_ON(1); + kfree(bbio); + return; } - /* - * first try to pick the mirror which is completely without I/O - * errors and also does not have a checksum error. - * If one is found, and if a checksum is present, the full block - * that is known to contain an error is rewritten. Afterwards - * the block is known to be corrected. - * If a mirror is found which is completely correct, and no - * checksum is present, only those pages are rewritten that had - * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other pages is better (and it - * could happen otherwise that a correct page would be - * overwritten by a bad one). 
- */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - - if (!sblock_other->header_error && - !sblock_other->checksum_error && - sblock_other->no_io_error_seen) { - int force_write = is_metadata || have_csum; - - ret = scrub_repair_block_from_good_copy(sblock_bad, - sblock_other, - force_write); - if (0 == ret) - goto corrected_error; - } - } + if (bbio->num_stripes == 1) + /* there aren't any replicas */ + goto uncorrectable; /* - * in case of I/O errors in the area that is supposed to be - * repaired, continue by picking good copies of those pages. - * Select the good pages from mirrors to rewrite bad pages from - * the area to fix. Afterwards verify the checksum of the block - * that is supposed to be repaired. This verification step is - * only done for the purpose of statistic counting and for the - * final scrub report, whether errors remain. - * A perfect algorithm could make use of the checksum and try - * all possible combinations of pages from the different mirrors - * until the checksum verification succeeds. For example, when - * the 2nd page of mirror #1 faces I/O errors, and the 2nd page - * of mirror #2 is readable but the final checksum test fails, - * then the 2nd page of mirror #3 could be tried, whether now - * the final checksum succeedes. But this would be a rare - * exception and is therefore not implemented. At least it is - * avoided that the good copy is overwritten. - * A more useful improvement would be to pick the sectors - * without I/O error based on sector sizes (512 bytes on legacy - * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one - * mirror could be repaired by taking 512 byte of a different - * mirror, even if other 512 byte sectors in the same PAGE_SIZE - * area are unreadable. + * first find a good copy */ - - /* can only fix I/O errors from here on */ - if (sblock_bad->no_io_error_seen) - goto did_not_correct_error; - - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; - - if (!page_bad->io_error) + for (i = 0; i < bbio->num_stripes; ++i) { + if (i + 1 == sbio->spag[ix].mirror_num) continue; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - struct scrub_page *page_other = sblock_other->pagev + - page_num; - - if (!page_other->io_error) { - ret = scrub_repair_page_from_good_copy( - sblock_bad, sblock_other, page_num, 0); - if (0 == ret) { - page_bad->io_error = 0; - break; /* succeeded for this page */ - } - } - } - - if (page_bad->io_error) { - /* did not find a mirror to copy the page from */ - success = 0; - } - } - - if (success) { - if (is_metadata || have_csum) { - /* - * need to verify the checksum now that all - * sectors on disk are repaired (the write - * request for data to be repaired is on its way). - * Just be lazy and use scrub_recheck_block() - * which re-reads the data before the checksum - * is verified, but most likely the data comes out - * of the page cache. 
- */ - ret = scrub_recheck_block(fs_info, sblock_bad, - is_metadata, have_csum, csum, - generation, sdev->csum_size); - if (!ret && !sblock_bad->header_error && - !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) - goto corrected_error; - else - goto did_not_correct_error; - } else { -corrected_error: - spin_lock(&sdev->stat_lock); - sdev->stat.corrected_errors++; - spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR - "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, + sbio->bio->bi_io_vec[ix].bv_page)) { + /* I/O-error, this is not a good copy */ + continue; } - } else { -did_not_correct_error: - spin_lock(&sdev->stat_lock); - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR - "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); - } -out: - if (sblocks_for_recheck) { - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; - mirror_index++) { - struct scrub_block *sblock = sblocks_for_recheck + - mirror_index; - int page_index; - - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; - page_index++) - if (sblock->pagev[page_index].page) - __free_page( - sblock->pagev[page_index].page); - } - kfree(sblocks_for_recheck); + if (scrub_fixup_check(sbio, ix) == 0) + break; } + if (i == bbio->num_stripes) + goto uncorrectable; - return 0; -} - -static int scrub_setup_recheck_block(struct scrub_dev *sdev, - struct btrfs_mapping_tree *map_tree, - u64 length, u64 logical, - struct scrub_block *sblocks_for_recheck) -{ - int page_index; - int mirror_index; - int ret; - - /* - * note: the three members sdev, ref_count and outstanding_pages - * are not used (and not set) in the blocks that are used for - * the recheck procedure - */ - - page_index = 0; - while (length > 0) { - u64 sublen = min_t(u64, length, PAGE_SIZE); - u64 mapped_length = sublen; - struct btrfs_bio *bbio = NULL; - + if (!sdev->readonly) { /* - * with a length of PAGE_SIZE, each returned stripe - * represents one mirror + * bi_io_vec[ix].bv_page now contains good data, write it back */ - ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, - &bbio, 0); - if (ret || !bbio || mapped_length < sublen) { - kfree(bbio); - return -EIO; - } - - BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); - for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; - mirror_index++) { - struct scrub_block *sblock; - struct scrub_page *page; - - if (mirror_index >= BTRFS_MAX_MIRRORS) - continue; - - sblock = sblocks_for_recheck + mirror_index; - page = sblock->pagev + page_index; - page->logical = logical; - page->physical = bbio->stripes[mirror_index].physical; - page->bdev = bbio->stripes[mirror_index].dev->bdev; - page->mirror_num = mirror_index + 1; - page->page = alloc_page(GFP_NOFS); - if (!page->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); - return -ENOMEM; - } - sblock->page_count++; + if (scrub_fixup_io(WRITE, sdev->dev->bdev, + (sbio->physical + ix * PAGE_SIZE) >> 9, + sbio->bio->bi_io_vec[ix].bv_page)) { + /* I/O-error, writeback failed, give up */ + goto uncorrectable; } - kfree(bbio); - length -= sublen; - logical += sublen; - page_index++; } - return 0; -} - -/* - * this function will check the on disk data for checksum errors, header - * errors and read I/O errors. 
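The restored scrub_fixup() repair loop reads the block from every mirror other than the one that failed, keeps the first copy that passes verification, and, unless the scrub is read-only, writes that copy back over the bad sectors; if no mirror verifies, the error is counted as uncorrectable. The userspace sketch below reproduces only that control flow; read_mirror(), verify() and write_back() are hypothetical stubs, not btrfs functions.

#include <stdio.h>
#include <string.h>

#define SKETCH_NR_MIRRORS 2
#define SKETCH_BLOCK_SIZE 4096

/* hypothetical stubs: return 0 on success, nonzero on failure */
static int read_mirror(int mirror, char *buf)
{
    memset(buf, mirror, SKETCH_BLOCK_SIZE);   /* pretend each mirror has data */
    return 0;
}
static int verify(const char *buf)      { return buf[0] != 1; } /* mirror 1 is "good" */
static int write_back(const char *buf)  { (void)buf; return 0; }

/* try to repair a block whose copy on "failed_mirror" did not verify */
static int sketch_fixup(int failed_mirror, int readonly)
{
    char buf[SKETCH_BLOCK_SIZE];
    int i;

    for (i = 0; i < SKETCH_NR_MIRRORS; i++) {
        if (i == failed_mirror)
            continue;                    /* do not re-read the bad copy */
        if (read_mirror(i, buf))
            continue;                    /* I/O error, not a good copy */
        if (verify(buf) == 0)
            break;                       /* found a verified copy */
    }
    if (i == SKETCH_NR_MIRRORS)
        return -1;                       /* uncorrectable */

    if (!readonly && write_back(buf))
        return -1;                       /* write-back failed, give up */
    return 0;                            /* corrected */
}

int main(void)
{
    printf("repair result: %d\n", sketch_fixup(0, 0));
    return 0;
}
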
If any I/O errors happen, the exact pages - * which are errored are marked as being bad. The goal is to enable scrub - * to take those pages that are not errored from all the mirrors so that - * the pages that are errored in the just handled mirror can be repaired. - */ -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size) -{ - int page_num; - - sblock->no_io_error_seen = 1; - sblock->header_error = 0; - sblock->checksum_error = 0; - - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct bio *bio; - int ret; - struct scrub_page *page = sblock->pagev + page_num; - DECLARE_COMPLETION_ONSTACK(complete); - - BUG_ON(!page->page); - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = page->bdev; - bio->bi_sector = page->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; - - ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { - bio_put(bio); - return -EIO; - } - btrfsic_submit_bio(READ, bio); + kfree(bbio); + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); - /* this will also unplug the queue */ - wait_for_completion(&complete); + printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", + (unsigned long long)logical); + return; - page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - sblock->no_io_error_seen = 0; - bio_put(bio); - } - - if (sblock->no_io_error_seen) - scrub_recheck_block_checksum(fs_info, sblock, is_metadata, - have_csum, csum, generation, - csum_size); +uncorrectable: + kfree(bbio); + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); - return 0; + printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " + "logical %llu\n", (unsigned long long)logical); } -static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int is_metadata, int have_csum, - const u8 *csum, u64 generation, - u16 csum_size) +static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, + struct page *page) { - int page_num; - u8 calculated_csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; - struct btrfs_root *root = fs_info->extent_root; - void *mapped_buffer; - - BUG_ON(!sblock->pagev[0].page); - if (is_metadata) { - struct btrfs_header *h; - - mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); - h = (struct btrfs_header *)mapped_buffer; - - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || - generation != le64_to_cpu(h->generation) || - memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || - memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) - sblock->header_error = 1; - csum = h->csum; - } else { - if (!have_csum) - return; - - mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); - } - - for (page_num = 0;;) { - if (page_num == 0 && is_metadata) - crc = btrfs_csum_data(root, - ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, - crc, PAGE_SIZE - BTRFS_CSUM_SIZE); - else - crc = btrfs_csum_data(root, mapped_buffer, crc, - PAGE_SIZE); - - kunmap_atomic(mapped_buffer, KM_USER0); - page_num++; - if (page_num >= sblock->page_count) - break; - BUG_ON(!sblock->pagev[page_num].page); + struct bio *bio = NULL; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); - mapped_buffer = kmap_atomic(sblock->pagev[page_num].page, - KM_USER0); - } + bio = 
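scrub_recheck_block_checksum(), deleted in this hunk, folds every page of a metadata block into a single running checksum, skipping the checksum bytes embedded at the start of the first page because the on-disk header stores a checksum of everything after itself. The sketch below models that walk in userspace; the 4-byte stand-in checksum and the csum_update()/csum_final() helpers are assumptions, not the kernel's crc32c routines.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096
#define SKETCH_CSUM_SIZE 4           /* bytes reserved at the start of page 0 */

/* deliberately simple running checksum, a stand-in for crc32c */
static uint32_t csum_update(uint32_t csum, const uint8_t *buf, size_t len)
{
    while (len--)
        csum = csum * 31 + *buf++;
    return csum;
}

static void csum_final(uint32_t csum, uint8_t out[SKETCH_CSUM_SIZE])
{
    memcpy(out, &csum, SKETCH_CSUM_SIZE);
}

/* checksum a multi-page block, skipping the embedded csum on page 0 */
static uint32_t sketch_csum_block(uint8_t **pages, int page_count)
{
    uint32_t csum = ~(uint32_t)0;

    for (int i = 0; i < page_count; i++) {
        const uint8_t *p = pages[i];
        size_t len = SKETCH_PAGE_SIZE;

        if (i == 0) {                /* skip the stored checksum itself */
            p   += SKETCH_CSUM_SIZE;
            len -= SKETCH_CSUM_SIZE;
        }
        csum = csum_update(csum, p, len);
    }
    return csum;
}

int main(void)
{
    uint8_t page0[SKETCH_PAGE_SIZE] = { 0 }, page1[SKETCH_PAGE_SIZE] = { 0 };
    uint8_t *pages[2] = { page0, page1 };
    uint8_t result[SKETCH_CSUM_SIZE];

    csum_final(sketch_csum_block(pages, 2), result);
    printf("csum bytes: %02x %02x %02x %02x\n",
           result[0], result[1], result[2], result[3]);
    return 0;
}
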
bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, PAGE_SIZE, 0); + bio->bi_end_io = scrub_fixup_end_io; + bio->bi_private = &complete; + btrfsic_submit_bio(rw, bio); - btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, csum, csum_size)) - sblock->checksum_error = 1; -} + /* this will also unplug the queue */ + wait_for_completion(&complete); -static void scrub_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); + ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; } -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write) +static void scrub_bio_end_io(struct bio *bio, int err) { - int page_num; - int ret = 0; - - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - int ret_sub; + struct scrub_bio *sbio = bio->bi_private; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - ret_sub = scrub_repair_page_from_good_copy(sblock_bad, - sblock_good, - page_num, - force_write); - if (ret_sub) - ret = ret_sub; - } + sbio->err = err; + sbio->bio = bio; - return ret; + btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); } -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write) +static void scrub_checksum(struct btrfs_work *work) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; - struct scrub_page *page_good = sblock_good->pagev + page_num; + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_dev *sdev = sbio->sdev; + struct page *page; + void *buffer; + int i; + u64 flags; + u64 logical; + int ret; - BUG_ON(sblock_bad->pagev[page_num].page == NULL); - BUG_ON(sblock_good->pagev[page_num].page == NULL); - if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || page_bad->io_error) { - struct bio *bio; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); - - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = page_bad->bdev; - bio->bi_sector = page_bad->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; - - ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { - bio_put(bio); - return -EIO; + if (sbio->err) { + ret = 0; + for (i = 0; i < sbio->count; ++i) + ret |= scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); } - btrfsic_submit_bio(WRITE, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - bio_put(bio); - } - return 0; -} + sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bio->bi_phys_segments = 0; + sbio->bio->bi_idx = 0; -static void scrub_checksum(struct scrub_block *sblock) -{ - u64 flags; - int ret; + for (i = 0; i < sbio->count; i++) { + struct bio_vec *bi; + bi = &sbio->bio->bi_io_vec[i]; + bi->bv_offset = 0; + bi->bv_len = PAGE_SIZE; + } + goto out; + } + for (i = 0; i < sbio->count; ++i) { + page = sbio->bio->bi_io_vec[i].bv_page; + buffer = kmap_atomic(page, KM_USER0); + flags = sbio->spag[i].flags; + logical = sbio->logical + i * PAGE_SIZE; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) { + ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = 
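scrub_fixup_io() makes a bio synchronous with an on-stack completion: the completion is stored in bi_private, the end_io callback fires complete() on it, and the submitter blocks in wait_for_completion() until the I/O finishes. The sketch below models that hand-off in userspace with a pthread condition variable and a worker thread standing in for the block layer; struct sketch_completion and fake_io() are illustrative only.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* a minimal stand-in for the kernel's struct completion */
struct sketch_completion {
    pthread_mutex_t lock;
    pthread_cond_t  cond;
    int             done;
};

static void sketch_complete(struct sketch_completion *c)
{
    pthread_mutex_lock(&c->lock);
    c->done = 1;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void sketch_wait_for_completion(struct sketch_completion *c)
{
    pthread_mutex_lock(&c->lock);
    while (!c->done)
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

/* stands in for the block layer finishing the bio and calling bi_end_io */
static void *fake_io(void *arg)
{
    usleep(1000);                       /* pretend the device took a moment */
    sketch_complete(arg);
    return NULL;
}

int main(void)
{
    struct sketch_completion done = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
    };
    pthread_t t;

    pthread_create(&t, NULL, fake_io, &done);   /* "submit" the I/O */
    sketch_wait_for_completion(&done);          /* block until end_io fires */
    printf("I/O completed\n");

    pthread_join(t, NULL);
    return 0;
}
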
scrub_checksum_tree_block(sdev, sbio->spag + i, + logical, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { + BUG_ON(i); + (void)scrub_checksum_super(sbio, buffer); + } else { + WARN_ON(1); + } + kunmap_atomic(buffer, KM_USER0); + if (ret) { + ret = scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } + } + } - BUG_ON(sblock->page_count < 1); - flags = sblock->pagev[0].flags; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) - ret = scrub_checksum_data(sblock); - else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = scrub_checksum_tree_block(sblock); - else if (flags & BTRFS_EXTENT_FLAG_SUPER) - (void)scrub_checksum_super(sblock); - else - WARN_ON(1); - if (ret) - scrub_handle_errored_block(sblock); +out: + scrub_free_bio(sbio->bio); + sbio->bio = NULL; + spin_lock(&sdev->list_lock); + sbio->next_free = sdev->first_free; + sdev->first_free = sbio->index; + spin_unlock(&sdev->list_lock); + atomic_dec(&sdev->in_flight); + wake_up(&sdev->list_wait); } -static int scrub_checksum_data(struct scrub_block *sblock) +static int scrub_checksum_data(struct scrub_dev *sdev, + struct scrub_page *spag, void *buffer) { - struct scrub_dev *sdev = sblock->sdev; u8 csum[BTRFS_CSUM_SIZE]; - u8 *on_disk_csum; - struct page *page; - void *buffer; u32 crc = ~(u32)0; int fail = 0; struct btrfs_root *root = sdev->dev->dev_root; - u64 len; - int index; - BUG_ON(sblock->page_count < 1); - if (!sblock->pagev[0].have_csum) + if (!spag->have_csum) return 0; - on_disk_csum = sblock->pagev[0].csum; - page = sblock->pagev[0].page; - buffer = kmap_atomic(page, KM_USER0); - - len = sdev->sectorsize; - index = 0; - for (;;) { - u64 l = min_t(u64, len, PAGE_SIZE); - - crc = btrfs_csum_data(root, buffer, crc, l); - kunmap_atomic(buffer, KM_USER0); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; - buffer = kmap_atomic(page, KM_USER0); - } - + crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); btrfs_csum_final(crc, csum); - if (memcmp(csum, on_disk_csum, sdev->csum_size)) + if (memcmp(csum, spag->csum, sdev->csum_size)) fail = 1; - if (fail) { - spin_lock(&sdev->stat_lock); + spin_lock(&sdev->stat_lock); + ++sdev->stat.data_extents_scrubbed; + sdev->stat.data_bytes_scrubbed += PAGE_SIZE; + if (fail) ++sdev->stat.csum_errors; - spin_unlock(&sdev->stat_lock); - } + spin_unlock(&sdev->stat_lock); return fail; } -static int scrub_checksum_tree_block(struct scrub_block *sblock) +static int scrub_checksum_tree_block(struct scrub_dev *sdev, + struct scrub_page *spag, u64 logical, + void *buffer) { - struct scrub_dev *sdev = sblock->sdev; struct btrfs_header *h; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 calculated_csum[BTRFS_CSUM_SIZE]; - u8 on_disk_csum[BTRFS_CSUM_SIZE]; - struct page *page; - void *mapped_buffer; - u64 mapped_size; - void *p; + u8 csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; int fail = 0; int crc_fail = 0; - u64 len; - int index; - - BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; - mapped_buffer = kmap_atomic(page, KM_USER0); - h = (struct btrfs_header *)mapped_buffer; - memcpy(on_disk_csum, h->csum, sdev->csum_size); /* * we don't use the getter functions here, as we * a) don't have an extent buffer and * b) the page is already kmapped */ + h = (struct btrfs_header *)buffer; - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) + 
if (logical != le64_to_cpu(h->bytenr)) ++fail; - if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) + if (spag->generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1306,99 +887,51 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - BUG_ON(sdev->nodesize != sdev->leafsize); - len = sdev->nodesize - BTRFS_CSUM_SIZE; - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; - index = 0; - for (;;) { - u64 l = min_t(u64, len, mapped_size); - - crc = btrfs_csum_data(root, p, crc, l); - kunmap_atomic(mapped_buffer, KM_USER0); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; - mapped_buffer = kmap_atomic(page, KM_USER0); - mapped_size = PAGE_SIZE; - p = mapped_buffer; - } - - btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, + PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, sdev->csum_size)) ++crc_fail; - if (crc_fail || fail) { - spin_lock(&sdev->stat_lock); - if (crc_fail) - ++sdev->stat.csum_errors; - if (fail) - ++sdev->stat.verify_errors; - spin_unlock(&sdev->stat_lock); - } + spin_lock(&sdev->stat_lock); + ++sdev->stat.tree_extents_scrubbed; + sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; + if (crc_fail) + ++sdev->stat.csum_errors; + if (fail) + ++sdev->stat.verify_errors; + spin_unlock(&sdev->stat_lock); return fail || crc_fail; } -static int scrub_checksum_super(struct scrub_block *sblock) +static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) { struct btrfs_super_block *s; - struct scrub_dev *sdev = sblock->sdev; + u64 logical; + struct scrub_dev *sdev = sbio->sdev; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 calculated_csum[BTRFS_CSUM_SIZE]; - u8 on_disk_csum[BTRFS_CSUM_SIZE]; - struct page *page; - void *mapped_buffer; - u64 mapped_size; - void *p; + u8 csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; int fail = 0; - u64 len; - int index; - BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; - mapped_buffer = kmap_atomic(page, KM_USER0); - s = (struct btrfs_super_block *)mapped_buffer; - memcpy(on_disk_csum, s->csum, sdev->csum_size); + s = (struct btrfs_super_block *)buffer; + logical = sbio->logical; - if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) + if (logical != le64_to_cpu(s->bytenr)) ++fail; - if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) + if (sbio->spag[0].generation != le64_to_cpu(s->generation)) ++fail; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ++fail; - len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; - index = 0; - for (;;) { - u64 l = min_t(u64, len, mapped_size); - - crc = btrfs_csum_data(root, p, crc, l); - kunmap_atomic(mapped_buffer, KM_USER0); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; - mapped_buffer = kmap_atomic(page, KM_USER0); - mapped_size = PAGE_SIZE; - p = mapped_buffer; - } - - btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, + 
PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, s->csum, sbio->sdev->csum_size)) ++fail; if (fail) { @@ -1415,42 +948,29 @@ static int scrub_checksum_super(struct scrub_block *sblock) return fail; } -static void scrub_block_get(struct scrub_block *sblock) -{ - atomic_inc(&sblock->ref_count); -} - -static void scrub_block_put(struct scrub_block *sblock) -{ - if (atomic_dec_and_test(&sblock->ref_count)) { - int i; - - for (i = 0; i < sblock->page_count; i++) - if (sblock->pagev[i].page) - __free_page(sblock->pagev[i].page); - kfree(sblock); - } -} - -static void scrub_submit(struct scrub_dev *sdev) +static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; if (sdev->curr == -1) - return; + return 0; sbio = sdev->bios[sdev->curr]; + sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); btrfsic_submit_bio(READ, sbio->bio); + + return 0; } -static int scrub_add_page_to_bio(struct scrub_dev *sdev, - struct scrub_page *spage) +static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force) { - struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; + struct page *page; int ret; again: @@ -1463,7 +983,7 @@ static int scrub_add_page_to_bio(struct scrub_dev *sdev, if (sdev->curr != -1) { sdev->first_free = sdev->bios[sdev->curr]->next_free; sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->page_count = 0; + sdev->bios[sdev->curr]->count = 0; spin_unlock(&sdev->list_lock); } else { spin_unlock(&sdev->list_lock); @@ -1471,200 +991,62 @@ static int scrub_add_page_to_bio(struct scrub_dev *sdev, } } sbio = sdev->bios[sdev->curr]; - if (sbio->page_count == 0) { + if (sbio->count == 0) { struct bio *bio; - sbio->physical = spage->physical; - sbio->logical = spage->logical; - bio = sbio->bio; - if (!bio) { - bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); - if (!bio) - return -ENOMEM; - sbio->bio = bio; - } + sbio->physical = physical; + sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = spage->physical >> 9; + bio->bi_sector = sbio->physical >> 9; sbio->err = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != - spage->physical || - sbio->logical + sbio->page_count * PAGE_SIZE != - spage->logical) { - scrub_submit(sdev); - goto again; - } - - sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { - if (sbio->page_count < 1) { - bio_put(sbio->bio); - sbio->bio = NULL; - return -EIO; - } - scrub_submit(sdev); + sbio->bio = bio; + } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || + sbio->logical + sbio->count * PAGE_SIZE != logical) { + ret = scrub_submit(sdev); + if (ret) + return ret; goto again; } + sbio->spag[sbio->count].flags = flags; + sbio->spag[sbio->count].generation = gen; + sbio->spag[sbio->count].have_csum = 0; + sbio->spag[sbio->count].mirror_num = mirror_num; - scrub_block_get(sblock); /* one for the added page */ - atomic_inc(&sblock->outstanding_pages); - sbio->page_count++; - if (sbio->page_count == sdev->pages_per_bio) - scrub_submit(sdev); - - return 0; -} - -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) -{ - struct scrub_block *sblock; - int index; - - sblock = 
kzalloc(sizeof(*sblock), GFP_NOFS); - if (!sblock) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + page = alloc_page(GFP_NOFS); + if (!page) return -ENOMEM; - } - - /* one ref inside this function, plus one for each page later on */ - atomic_set(&sblock->ref_count, 1); - sblock->sdev = sdev; - sblock->no_io_error_seen = 1; - - for (index = 0; len > 0; index++) { - struct scrub_page *spage = sblock->pagev + index; - u64 l = min_t(u64, len, PAGE_SIZE); - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); - while (index > 0) { - index--; - __free_page(sblock->pagev[index].page); - } - kfree(sblock); - return -ENOMEM; - } - spage->sblock = sblock; - spage->bdev = sdev->dev->bdev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->mirror_num = mirror_num; - if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sdev->csum_size); - } else { - spage->have_csum = 0; - } - sblock->page_count++; - len -= l; - logical += l; - physical += l; + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; } - BUG_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev + index; + if (csum) { + sbio->spag[sbio->count].have_csum = 1; + memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); + } + ++sbio->count; + if (sbio->count == SCRUB_PAGES_PER_BIO || force) { int ret; - ret = scrub_add_page_to_bio(sdev, spage); - if (ret) { - scrub_block_put(sblock); + ret = scrub_submit(sdev); + if (ret) return ret; - } } - if (force) - scrub_submit(sdev); - - /* last one frees, either here or in bio completion for last page */ - scrub_block_put(sblock); return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) -{ - struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - - sbio->err = err; - sbio->bio = bio; - - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); -} - -static void scrub_bio_end_io_worker(struct btrfs_work *work) -{ - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; - int i; - - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); - if (sbio->err) { - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - - spage->io_error = 1; - spage->sblock->no_io_error_seen = 0; - } - } - - /* now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - struct scrub_block *sblock = spage->sblock; - - if (atomic_dec_and_test(&sblock->outstanding_pages)) - scrub_block_complete(sblock); - scrub_block_put(sblock); - } - - if (sbio->err) { - /* what is this good for??? 
*/ - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; - - for (i = 0; i < sbio->page_count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - } - - bio_put(sbio->bio); - sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); -} - -static void scrub_block_complete(struct scrub_block *sblock) -{ - if (!sblock->no_io_error_seen) - scrub_handle_errored_block(sblock); - else - scrub_checksum(sblock); -} - static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum) { @@ -1672,6 +1054,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, int ret = 0; unsigned long i; unsigned long num_sectors; + u32 sectorsize = sdev->dev->dev_root->sectorsize; while (!list_empty(&sdev->csum_list)) { sum = list_first_entry(&sdev->csum_list, @@ -1689,7 +1072,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sdev->sectorsize; + num_sectors = sum->len / sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { memcpy(csum, &sum->sums[i].sum, sdev->csum_size); @@ -1710,28 +1093,9 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, { int ret; u8 csum[BTRFS_CSUM_SIZE]; - u32 blocksize; - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sdev->sectorsize; - spin_lock(&sdev->stat_lock); - sdev->stat.data_extents_scrubbed++; - sdev->stat.data_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sdev->nodesize != sdev->leafsize); - blocksize = sdev->nodesize; - spin_lock(&sdev->stat_lock); - sdev->stat.tree_extents_scrubbed++; - sdev->stat.tree_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); - } else { - blocksize = sdev->sectorsize; - BUG_ON(1); - } while (len) { - u64 l = min_t(u64, len, blocksize); + u64 l = min_t(u64, len, PAGE_SIZE); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -1740,8 +1104,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (have_csum == 0) ++sdev->stat.no_csum; } - ret = scrub_pages(sdev, logical, l, physical, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + ret = scrub_page(sdev, logical, l, physical, flags, gen, + mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; len -= l; @@ -1806,11 +1170,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; - /* - * work on commit root. The related disk blocks are static as - * long as COW is applied. 
This means, it is save to rewrite - * them to repair disk errors without any race conditions - */ path->search_commit_root = 1; path->skip_locking = 1; @@ -2161,11 +1520,11 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) break; - ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, + BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } @@ -2224,30 +1583,10 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, /* * check some assumptions */ - if (root->nodesize != root->leafsize) { - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", - root->nodesize, root->leafsize); - return -EINVAL; - } - - if (root->nodesize > BTRFS_STRIPE_LEN) { - /* - * in this case scrub is unable to calculate the checksum - * the way scrub is implemented. Do not handle this - * situation at all because it won't ever happen. - */ - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", - root->nodesize, BTRFS_STRIPE_LEN); - return -EINVAL; - } - - if (root->sectorsize != PAGE_SIZE) { - /* not supported for data w/o checksums */ - printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - root->sectorsize, (unsigned long long)PAGE_SIZE); + if (root->sectorsize != PAGE_SIZE || + root->sectorsize != root->leafsize || + root->sectorsize != root->nodesize) { + printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); return -EINVAL; } @@ -2402,7 +1741,6 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) return 0; } - int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) { struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/trunk/fs/btrfs/struct-funcs.c b/trunk/fs/btrfs/struct-funcs.c index c6ffa5812419..bc1f6ad18442 100644 --- a/trunk/fs/btrfs/struct-funcs.c +++ b/trunk/fs/btrfs/struct-funcs.c @@ -44,9 +44,8 @@ #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ -u##bits btrfs_token_##name(struct extent_buffer *eb, \ - type *s, struct btrfs_map_token *token) \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -55,18 +54,9 @@ u##bits btrfs_token_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ - unsigned long mem_len = sizeof(((type *)0)->member); \ u##bits res; \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ - kaddr = token->kaddr; \ - p = (type *)(kaddr + part_offset - token->offset); \ - res = le##bits##_to_cpu(p->member); \ - return res; \ - } \ err = map_private_extent_buffer(eb, offset, \ - mem_len, \ + sizeof(((type *)0)->member), \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits leres; \ @@ -75,15 +65,10 @@ u##bits btrfs_token_##name(struct extent_buffer *eb, \ } \ p 
= (type *)(kaddr + part_offset - map_start); \ res = le##bits##_to_cpu(p->member); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ - } \ return res; \ } \ -void btrfs_set_token_##name(struct extent_buffer *eb, \ - type *s, u##bits val, struct btrfs_map_token *token) \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -92,17 +77,8 @@ void btrfs_set_token_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ - unsigned long mem_len = sizeof(((type *)0)->member); \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ - kaddr = token->kaddr; \ - p = (type *)(kaddr + part_offset - token->offset); \ - p->member = cpu_to_le##bits(val); \ - return; \ - } \ err = map_private_extent_buffer(eb, offset, \ - mem_len, \ + sizeof(((type *)0)->member), \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits val2; \ @@ -112,22 +88,7 @@ void btrfs_set_token_##name(struct extent_buffer *eb, \ } \ p = (type *)(kaddr + part_offset - map_start); \ p->member = cpu_to_le##bits(val); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ - } \ -} \ -void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ -{ \ - btrfs_set_token_##name(eb, s, val, NULL); \ -} \ -u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ -{ \ - return btrfs_token_##name(eb, s, NULL); \ -} \ +} #include "ctree.h" diff --git a/trunk/fs/btrfs/super.c b/trunk/fs/btrfs/super.c index 3ce97b217cbe..9774e38a0532 100644 --- a/trunk/fs/btrfs/super.c +++ b/trunk/fs/btrfs/super.c @@ -76,6 +76,9 @@ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, case -EROFS: errstr = "Readonly filesystem"; break; + case -EEXIST: + errstr = "Object already exists"; + break; default: if (nbuf) { if (snprintf(nbuf, 16, "error %d", -errno) >= 0) @@ -145,6 +148,36 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, btrfs_handle_error(fs_info); } +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, + * issues an alert, and either panics or BUGs, depending on mount options. + */ +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) 
+{ + char nbuf[16]; + char *s_id = ""; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", + s_id, function, line, &vaf, errstr); + + printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", + s_id, function, line, &vaf, errstr); + va_end(args); + /* Caller calls BUG() */ +} + static void btrfs_put_super(struct super_block *sb) { (void)close_ctree(btrfs_sb(sb)->tree_root); @@ -166,7 +199,7 @@ enum { Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, + Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_err, }; @@ -206,6 +239,7 @@ static match_table_t tokens = { {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_err, NULL}, }; @@ -438,6 +472,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) ret = -EINVAL; goto out; #endif + case Opt_fatal_errors: + if (strcmp(args[0].from, "panic") == 0) + btrfs_set_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else if (strcmp(args[0].from, "bug") == 0) + btrfs_clear_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else { + ret = -EINVAL; + goto out; + } + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -766,6 +812,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(root, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); + if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + seq_puts(seq, ",fatal_errors=panic"); return 0; } diff --git a/trunk/fs/btrfs/volumes.c b/trunk/fs/btrfs/volumes.c index 58aad63e1ad3..ef41f285a475 100644 --- a/trunk/fs/btrfs/volumes.c +++ b/trunk/fs/btrfs/volumes.c @@ -4384,7 +4384,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) * to silence the warning eg. on PowerPC 64. */ if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->pages[0]); + SetPageUptodate(sb->first_page); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy);