Skip to content

Commit

Permalink
ext4: completed_io locking cleanup
Browse files Browse the repository at this point in the history
Current unwritten extent conversion state-machine is very fuzzy.
- For unknown reason it performs conversion under i_mutex. What for?
  My diagnosis:
  We already protect extent tree with i_data_sem, truncate and punch_hole
  should wait for DIO, so the only data we have to protect is end_io->flags
  modification, but only flush_completed_IO and end_io_work modified this
  flags and we can serialize them via i_completed_io_lock.

  Currently all these games with mutex_trylock result in the following deadlock
   truncate:                          kworker:
    ext4_setattr                       ext4_end_io_work
    mutex_lock(i_mutex)
    inode_dio_wait(inode)  ->BLOCK
                             DEADLOCK<- mutex_trylock()
                                        inode_dio_done()
  #TEST_CASE1_BEGIN
  MNT=/mnt_scrach
  unlink $MNT/file
  fallocate -l $((1024*1024*1024)) $MNT/file
  aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
  sleep 2
  truncate -s 0 $MNT/file
  #TEST_CASE1_END

Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286

This patch makes state machine simple and clean:

(1) xxx_end_io schedule final extent conversion simply by calling
    ext4_add_complete_io(), which append it to ei->i_completed_io_list
    NOTE1: because of (2A) work should be queued only if
    ->i_completed_io_list was empty, otherwise the work is scheduled already.

(2) ext4_flush_completed_IO is responsible for handling all pending
    end_io from ei->i_completed_io_list
    Flushing sequence consists of following stages:
    A) LOCKED: Atomically drain completed_io_list to local_list
    B) Perform extents conversion
    C) LOCKED: move converted io's to to_free list for final deletion
       	     This logic depends on context which we was called from.
    D) Final end_io context destruction
    NOTE1: i_mutex is no longer required because end_io->flags modification
    is protected by ei->ext4_complete_io_lock

Full list of changes:
- Move all completion end_io related routines to page-io.c in order to improve
  logic locality
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
- remove EXT4_IO_END_FSYNC
- Improve SMP scalability by removing useless i_mutex which does not
  protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
- Rename ext4_end_io_nolock to end4_end_io and make it static
- Check flush completion status to ext4_ext_punch_hole(). Because it is
  not good idea to punch blocks from corrupted inode.

Changes since V3 (in request to Jan's comments):
  Fall back to active flush_completed_IO() approach in order to prevent
  performance issues with nolocked DIO reads.
Changes since V2:
  Fix use-after-free caused by race truncate vs end_io_work

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
  • Loading branch information
Dmitry Monakhov authored and Theodore Ts'o committed Sep 29, 2012
1 parent 82e5422 commit 28a535f
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 169 deletions.
3 changes: 1 addition & 2 deletions fs/ext4/ext4.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
#define EXT4_IO_END_IN_FSYNC 0x0010

struct ext4_io_page {
struct page *p_page;
Expand Down Expand Up @@ -2418,11 +2417,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,

/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
Expand Down
4 changes: 3 additions & 1 deletion fs/ext4/extents.c
Original file line number Diff line number Diff line change
Expand Up @@ -4833,7 +4833,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
}

/* finish any pending end_io work */
ext4_flush_completed_IO(inode);
err = ext4_flush_completed_IO(inode);
if (err)
return err;

credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
Expand Down
81 changes: 0 additions & 81 deletions fs/ext4/fsync.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,87 +34,6 @@

#include <trace/events/ext4.h>

static void dump_completed_IO(struct inode * inode)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
unsigned long flags;

if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
return;
}

ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
io1 = container_of(after, ext4_io_end_t, list);

ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
io, inode->i_ino, io0, io1);
}
spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
#endif
}

/*
* This function is called from ext4_sync_file().
*
* When IO is completed, the work to convert unwritten extents to
* written is queued on workqueue but may not get immediately
* scheduled. When fsync is called, we need to ensure the
* conversion is complete before fsync returns.
* The inode keeps track of a list of pending/completed IO that
* might needs to do the conversion. This function walks through
* the list and convert the related unwritten extents for completed IO
* to written.
* The function return the number of pending IOs on success.
*/
int ext4_flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned long flags;
int ret = 0;
int ret2 = 0;

dump_completed_IO(inode);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
while (!list_empty(&ei->i_completed_io_list)){
io = list_entry(ei->i_completed_io_list.next,
ext4_io_end_t, list);
list_del_init(&io->list);
io->flag |= EXT4_IO_END_IN_FSYNC;
/*
* Calling ext4_end_io_nolock() to convert completed
* IO to written.
*
* When ext4_sync_file() is called, run_queue() may already
* about to flush the work corresponding to this io structure.
* It will be upset if it founds the io structure related
* to the work-to-be schedule is freed.
*
* Thus we need to keep the io structure still valid here after
* conversion finished. The io structure has a flag to
* avoid double converting from both fsync and background work
* queue work.
*/
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
ret = ext4_end_io_nolock(io);
if (ret < 0)
ret2 = ret;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
io->flag &= ~EXT4_IO_END_IN_FSYNC;
}
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
return (ret2 < 0) ? ret2 : 0;
}

/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
Expand Down
6 changes: 2 additions & 4 deletions fs/ext4/indirect.c
Original file line number Diff line number Diff line change
Expand Up @@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,

retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
if (unlikely(!list_empty(&ei->i_completed_io_list))) {
mutex_lock(&inode->i_mutex);
if (unlikely(!list_empty(&ei->i_completed_io_list)))
ext4_flush_completed_IO(inode);
mutex_unlock(&inode->i_mutex);
}

ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
Expand Down
25 changes: 2 additions & 23 deletions fs/ext4/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2881,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;
unsigned long flags;
struct ext4_inode_info *ei;

/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
Expand Down Expand Up @@ -2912,24 +2909,14 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
io_end->iocb = iocb;
io_end->result = ret;
}
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;

/* Add the io_end to per-inode completed aio dio list*/
ei = EXT4_I(io_end->inode);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
ext4_add_complete_io(io_end);
}

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
struct workqueue_struct *wq;
struct inode *inode;
unsigned long flags;

if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
Expand All @@ -2948,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);

/* Add the io_end to per-inode completed io list*/
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);

wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
Expand Down
Loading

0 comments on commit 28a535f

Please sign in to comment.