Skip to content

Commit

Permalink
gfs2: Fix mmap + page fault deadlocks for direct I/O
Browse files Browse the repository at this point in the history
Also disable page faults during direct I/O requests and implement a
similar kind of retry logic as in the buffered I/O case.

The retry logic in the direct I/O case differs from the buffered I/O
case in the following way: direct I/O doesn't provide the kinds of
consistency guarantees between concurrent reads and writes that buffered
I/O provides, so once we lose the inode glock while faulting in user
pages, we always resume the operation.  We never need to return a
partial read or write.

This locking problem was originally reported by Jan Kara.  Linus came up
with the idea of disabling page faults.  Many thanks to Al Viro and
Matthew Wilcox for their feedback.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
  • Loading branch information
Andreas Gruenbacher committed Oct 25, 2021
1 parent 3337ab0 commit b01b2d7
Showing 1 changed file with 87 additions and 12 deletions.
99 changes: 87 additions & 12 deletions fs/gfs2/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -811,22 +811,64 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
size_t count = iov_iter_count(to);
size_t prev_count = 0, window_size = 0;
size_t written = 0;
ssize_t ret;

if (!count)
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
* that the inode glock may be dropped, fault in the pages manually,
* and retry.
*
* Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
* physical as well as manual page faults, and we need to disable both
* kinds.
*
* For direct I/O, gfs2 takes the inode glock in deferred mode. This
* locking mode is compatible with other deferred holders, so multiple
* processes and nodes can do direct I/O to a file at the same time.
* There's no guarantee that reads or writes will be atomic. Any
* coordination among readers and writers needs to happen externally.
*/

if (!iov_iter_count(to))
return 0; /* skip atime */

gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
retry_under_glock:
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
IOMAP_DIO_PARTIAL, written);
to->nofault = false;
pagefault_enable();
if (ret > 0)
written = ret;

ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0);
gfs2_glock_dq(gh);
if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
size_t leftover;

gfs2_holder_allow_demote(gh);
leftover = fault_in_iov_iter_writeable(to, window_size);
gfs2_holder_disallow_demote(gh);
if (leftover != window_size) {
if (!gfs2_holder_queued(gh))
goto retry;
goto retry_under_glock;
}
}
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
return ret;
if (ret < 0)
return ret;
return written;
}

static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
Expand All @@ -835,10 +877,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
size_t len = iov_iter_count(from);
loff_t offset = iocb->ki_pos;
size_t prev_count = 0, window_size = 0;
size_t read = 0;
ssize_t ret;

/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
* that the inode glock may be dropped, fault in the pages manually,
* and retry.
*
* For writes, iomap_dio_rw only triggers manual page faults, so we
* don't need to disable physical ones.
*/

/*
* Deferred lock, even if it's a write, since we do no allocation on
* this path. All we need to change is the atime, and this lock mode
Expand All @@ -848,22 +900,45 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
* VFS does.
*/
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;

retry_under_glock:
/* Silently fall back to buffered I/O when writing beyond EOF */
if (offset + len > i_size_read(&ip->i_inode))
if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
goto out;

ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0);
from->nofault = true;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
IOMAP_DIO_PARTIAL, read);
from->nofault = false;

if (ret == -ENOTBLK)
ret = 0;
if (ret > 0)
read = ret;

if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
size_t leftover;

gfs2_holder_allow_demote(gh);
leftover = fault_in_iov_iter_readable(from, window_size);
gfs2_holder_disallow_demote(gh);
if (leftover != window_size) {
if (!gfs2_holder_queued(gh))
goto retry;
goto retry_under_glock;
}
}
out:
gfs2_glock_dq(gh);
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
return ret;
if (ret < 0)
return ret;
return read;
}

static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
Expand Down

0 comments on commit b01b2d7

Please sign in to comment.