Skip to content

Commit

Permalink
Btrfs: RAID5 and RAID6
Browse files Browse the repository at this point in the history
This builds on David Woodhouse's original Btrfs raid5/6 implementation.
The code has changed quite a bit, blame Chris Mason for any bugs.

Read/modify/write is done after the higher levels of the filesystem have
prepared a given bio.  This means the higher layers are not responsible
for building full stripes, and they don't need to query for the topology
of the extents that may get allocated during delayed allocation runs.
It also means different files can easily share the same stripe.

But, it does expose us to incorrect parity if we crash or lose power
while doing a read/modify/write cycle.  This will be addressed in a
later commit.

Scrub is unable to repair crc errors on raid5/6 chunks.

Discard does not work on raid5/6 (yet)

The stripe size is fixed at 64KiB per disk.  This will be tunable
in a later commit.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
  • Loading branch information
David Woodhouse authored and Chris Mason committed Feb 1, 2013
1 parent 64a1670 commit 53b381b
Show file tree
Hide file tree
Showing 15 changed files with 2,283 additions and 102 deletions.
2 changes: 2 additions & 0 deletions fs/btrfs/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ config BTRFS_FS
select ZLIB_DEFLATE
select LZO_COMPRESS
select LZO_DECOMPRESS
select RAID6_PQ

help
Btrfs is a new filesystem with extents, writable snapshotting,
support for multiple devices and many more features.
Expand Down
2 changes: 1 addition & 1 deletion fs/btrfs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o

btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
35 changes: 34 additions & 1 deletion fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)

#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)

#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
Expand All @@ -511,6 +512,7 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)

/*
Expand Down Expand Up @@ -952,15 +954,19 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
#define BTRFS_NR_RAID_TYPES 5
#define BTRFS_NR_RAID_TYPES 7

#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
BTRFS_BLOCK_GROUP_METADATA)

#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID1 | \
BTRFS_BLOCK_GROUP_RAID5 | \
BTRFS_BLOCK_GROUP_RAID6 | \
BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID10)
/*
Expand Down Expand Up @@ -1185,6 +1191,10 @@ struct btrfs_block_group_cache {
u64 flags;
u64 sectorsize;
u64 cache_generation;

/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;

unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
Expand Down Expand Up @@ -1225,6 +1235,20 @@ struct seq_list {
u64 seq;
};

/* used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
struct list_head hash_list;
wait_queue_head_t wait;
spinlock_t lock;
};

/* used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
struct btrfs_stripe_hash *table;
};

#define BTRFS_STRIPE_HASH_TABLE_BITS 11

/* fs_info */
struct reloc_control;
struct btrfs_device;
Expand Down Expand Up @@ -1307,6 +1331,13 @@ struct btrfs_fs_info {
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;

/* this is used during read/modify/write to make sure
* no two ios are trying to mod the same stripe at the same
* time
*/
struct btrfs_stripe_hash_table *stripe_hash_table;

/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
Expand Down Expand Up @@ -1395,6 +1426,8 @@ struct btrfs_fs_info {
struct btrfs_workers flush_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_meta_workers;
struct btrfs_workers endio_raid56_workers;
struct btrfs_workers rmw_workers;
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers endio_freespace_worker;
Expand Down
62 changes: 53 additions & 9 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"

#ifdef CONFIG_X86
#include <asm/cpufeature.h>
Expand Down Expand Up @@ -639,8 +640,15 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
btree_readahead_hook(root, eb, eb->start, ret);
}

if (ret)
if (ret) {
/*
* our io error hook is going to dec the io pages
* again, we have to make sure it has something
* to decrement
*/
atomic_inc(&eb->io_pages);
clear_extent_buffer_uptodate(eb);
}
free_extent_buffer(eb);
out:
return ret;
Expand All @@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
eb = (struct extent_buffer *)page->private;
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(root, eb, eb->start, -EIO);
return -EIO; /* we fixed nothing */
Expand All @@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
end_io_wq->work.flags = 0;

if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == 1)
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
btrfs_queue_worker(&fs_info->endio_meta_write_workers,
&end_io_wq->work);
else if (end_io_wq->metadata == 2)
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
btrfs_queue_worker(&fs_info->endio_freespace_worker,
&end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
btrfs_queue_worker(&fs_info->endio_raid56_workers,
&end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_write_workers,
&end_io_wq->work);
} else {
if (end_io_wq->metadata)
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
btrfs_queue_worker(&fs_info->endio_raid56_workers,
&end_io_wq->work);
else if (end_io_wq->metadata)
btrfs_queue_worker(&fs_info->endio_meta_workers,
&end_io_wq->work);
else
Expand All @@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
* 0 - if data
* 1 - if normal metadta
* 2 - if writing to the free space cache area
* 3 - raid parity work
*/
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata)
Expand Down Expand Up @@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);

ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = -ENOMEM;
goto fail_alloc;
}

__setup_root(4096, 4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);

Expand Down Expand Up @@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->endio_meta_write_workers,
"endio-meta-write", fs_info->thread_pool_size,
&fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_raid56_workers,
"endio-raid56", fs_info->thread_pool_size,
&fs_info->generic_worker);
btrfs_init_workers(&fs_info->rmw_workers,
"rmw", fs_info->thread_pool_size,
&fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
fs_info->thread_pool_size,
&fs_info->generic_worker);
Expand All @@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb,
*/
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_meta_workers.idle_thresh = 4;
fs_info->endio_raid56_workers.idle_thresh = 4;
fs_info->rmw_workers.idle_thresh = 2;

fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
Expand All @@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb,
ret |= btrfs_start_workers(&fs_info->fixup_workers);
ret |= btrfs_start_workers(&fs_info->endio_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
ret |= btrfs_start_workers(&fs_info->rmw_workers);
ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
Expand Down Expand Up @@ -2710,6 +2742,8 @@ int open_ctree(struct super_block *sb,
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
btrfs_stop_workers(&fs_info->endio_raid56_workers);
btrfs_stop_workers(&fs_info->rmw_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
Expand All @@ -2728,6 +2762,7 @@ int open_ctree(struct super_block *sb,
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;

Expand Down Expand Up @@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
== 0)))
num_tolerated_disk_barrier_failures = 0;
else if (num_tolerated_disk_barrier_failures > 1
&&
(flags & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10)))
num_tolerated_disk_barrier_failures = 1;
else if (num_tolerated_disk_barrier_failures > 1) {
if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID10)) {
num_tolerated_disk_barrier_failures = 1;
} else if (flags &
BTRFS_BLOCK_GROUP_RAID5) {
num_tolerated_disk_barrier_failures = 2;
}
}
}
}
up_read(&sinfo->groups_sem);
Expand Down Expand Up @@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
btrfs_stop_workers(&fs_info->endio_raid56_workers);
btrfs_stop_workers(&fs_info->rmw_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
Expand All @@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root)
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);

btrfs_free_stripe_hash_table(fs_info);

return 0;
}

Expand Down
7 changes: 7 additions & 0 deletions fs/btrfs/disk-io.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12

enum {
BTRFS_WQ_ENDIO_DATA = 0,
BTRFS_WQ_ENDIO_METADATA = 1,
BTRFS_WQ_ENDIO_FREE_SPACE = 2,
BTRFS_WQ_ENDIO_RAID56 = 3,
};

static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = 16 * 1024;
Expand Down
Loading

0 comments on commit 53b381b

Please sign in to comment.