Skip to content

Commit

Permalink
ext4: use ext4_get_block_write in buffer write
Browse files Browse the repository at this point in the history
Allocate uninitialized extent before ext4 buffer write and
convert the extent to initialized after io completes.
The purpose is to make sure an extent can only be marked
initialized after it has been written with new data so
we can safely drop the i_mutex lock in ext4 DIO read without
exposing stale data. This helps to improve multi-thread DIO
read performance on high-speed disks.

Skip the nobh and data=journal mount cases to make things simple for now.

Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
  • Loading branch information
Jiaying Zhang authored and Theodore Ts'o committed Mar 4, 2010
1 parent c7064ef commit 744692d
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 55 deletions.
15 changes: 13 additions & 2 deletions fs/ext4/ext4.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ typedef struct ext4_io_end {
struct list_head list; /* per-file finished AIO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
int error; /* I/O error code */
struct page *page; /* page struct for buffer write */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
struct work_struct work; /* data work queue */
Expand Down Expand Up @@ -361,7 +361,7 @@ struct ext4_new_group_data {
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_IO_CREATE_EXT)
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)

/*
* Flags used by ext4_free_blocks
Expand Down Expand Up @@ -702,6 +702,7 @@ struct ext4_inode_info {

/* completed IOs that might need unwritten extents handling */
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;

Expand Down Expand Up @@ -752,6 +753,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
Expand Down Expand Up @@ -1781,6 +1783,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 len, __u64 *moved_len);


/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
= BH_JBDPrivateStart,
};

BUFFER_FNS(Uninit, uninit)
TAS_BUFFER_FNS(Uninit, uninit)

/*
* Add new method to test wether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
Expand Down
24 changes: 24 additions & 0 deletions fs/ext4/ext4_jbd2.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return 0;
}

/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
* i_mutex for direct I/O reads. This only works for extent-based
* files, and it doesn't work for nobh or if data journaling is
* enabled, since the dioread_nolock code uses b_private to pass
* information back to the I/O completion handler, and this conflicts
* with the jbd's use of b_private.
*/
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
return 0;
if (test_opt(inode->i_sb, NOBH))
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
return 0;
if (ext4_should_journal_data(inode))
return 0;
return 1;
}

#endif /* _EXT4_JBD2_H */
22 changes: 12 additions & 10 deletions fs/ext4/extents.c
Original file line number Diff line number Diff line change
Expand Up @@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
BUG_ON(path[depth].p_hdr == NULL);

/* try to insert block into found extent and return */
if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO)
if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
&& ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
Expand Down Expand Up @@ -1740,7 +1740,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,

merge:
/* try to merge extents to the right */
if (flag != EXT4_GET_BLOCKS_PRE_IO)
if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(inode, path, nearex);

/* try to merge extents to the left */
Expand Down Expand Up @@ -3065,7 +3065,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext4_ext_show_leaf(inode, path);

/* get_block() before submit the IO, split the extent */
if (flags == EXT4_GET_BLOCKS_PRE_IO) {
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle,
inode, path, iblock,
max_blocks, flags);
Expand All @@ -3078,10 +3078,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
io->flag = EXT4_IO_UNWRITTEN;
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
if (ext4_should_dioread_nolock(inode))
set_buffer_uninit(bh_result);
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags == EXT4_GET_BLOCKS_CONVERT) {
if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
ret = ext4_convert_unwritten_extents_endio(handle, inode,
path);
if (ret >= 0)
Expand Down Expand Up @@ -3351,21 +3353,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
/*
* io_end structure was created for every async
* direct IO write to the middle of the file.
* To avoid unecessary convertion for every aio dio rewrite
* to the mid of file, here we flag the IO that is really
* need the convertion.
* io_end structure was created for every IO write to an
* uninitialized extent. To avoid unecessary conversion,
* here we flag the IO that really needs the conversion.
* For non asycn direct IO case, flag the inode state
* that we need to perform convertion when IO is done.
*/
if (flags == EXT4_GET_BLOCKS_PRE_IO) {
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
if (io)
io->flag = EXT4_IO_UNWRITTEN;
else
ext4_set_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN);
}
if (ext4_should_dioread_nolock(inode))
set_buffer_uninit(bh_result);
}

if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
Expand Down
Loading

0 comments on commit 744692d

Please sign in to comment.